mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
remove some more metrics
This commit is contained in:
@@ -21,7 +21,7 @@ use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{QueryError, is_expected_io_error};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
|
||||
use strum::{IntoEnumIterator as _, VariantNames};
|
||||
use strum_macros::{IntoStaticStr, VariantNames};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -191,39 +191,6 @@ pub(crate) struct ScanLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
|
||||
|
||||
pub(crate) struct PageCacheSizeMetrics {
|
||||
pub max_bytes: UIntGauge,
|
||||
|
||||
pub current_bytes_immutable: UIntGauge,
|
||||
}
|
||||
|
||||
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_page_cache_size_current_bytes",
|
||||
"Current size of the page cache in bytes, by key kind",
|
||||
&["key_kind"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
|
||||
Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
#[derive(IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum PageCacheErrorKind {
|
||||
@@ -272,44 +239,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
|
||||
|
||||
pub(crate) mod wait_ondemand_download_time {
|
||||
use super::*;
|
||||
const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
|
||||
0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms
|
||||
0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s
|
||||
1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s
|
||||
10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m
|
||||
];
|
||||
|
||||
/// The task kinds for which we want to track wait times for on-demand downloads.
|
||||
/// Other task kinds' wait times are accumulated in label value `unknown`.
|
||||
pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [
|
||||
TaskKind::PageRequestHandler,
|
||||
TaskKind::WalReceiverConnectionHandler,
|
||||
];
|
||||
|
||||
pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy<Vec<Histogram>> = Lazy::new(|| {
|
||||
let histo = register_histogram_vec!(
|
||||
"pageserver_wait_ondemand_download_seconds_global",
|
||||
"Observations are individual tasks' wait times for on-demand downloads. \
|
||||
If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.",
|
||||
&["task_kind"],
|
||||
WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
|
||||
.iter()
|
||||
.map(|task_kind| histo.with_label_values(&[task_kind.into()]))
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy<CounterVec> = Lazy::new(|| {
|
||||
register_counter_vec!(
|
||||
// use a name that _could_ be evolved into a per-timeline histogram later
|
||||
"pageserver_wait_ondemand_download_seconds_sum",
|
||||
"Like `pageserver_wait_ondemand_download_seconds_global` but per timeline",
|
||||
&["tenant_id", "shard_id", "timeline_id", "task_kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub struct WaitOndemandDownloadTimeSum {
|
||||
}
|
||||
@@ -324,19 +253,10 @@ pub(crate) mod wait_ondemand_download_time {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) {
|
||||
for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS {
|
||||
let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
task_kind.into(),
|
||||
]);
|
||||
}
|
||||
pub(crate) fn shutdown_timeline(_tenant_id: &str, _shard_id: &str, _timeline_id: &str) {
|
||||
}
|
||||
|
||||
pub(crate) fn preinitialize_global_metrics() {
|
||||
Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -588,119 +508,15 @@ pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{IntCounter, IntCounterVec, register_int_counter, register_int_counter_vec};
|
||||
use metrics::{IntCounter, register_int_counter};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) struct StartCalculation(IntCounterVec);
|
||||
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
|
||||
StartCalculation(
|
||||
register_int_counter_vec!(
|
||||
"pageserver_initial_logical_size_start_calculation",
|
||||
"Incremented each time we start an initial logical size calculation attempt. \
|
||||
The `circumstances` label provides some additional details.",
|
||||
&["attempt", "circumstances"]
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
struct DropCalculation {
|
||||
first: IntCounter,
|
||||
retry: IntCounter,
|
||||
}
|
||||
|
||||
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
|
||||
let vec = register_int_counter_vec!(
|
||||
"pageserver_initial_logical_size_drop_calculation",
|
||||
"Incremented each time we abort a started size calculation attmpt.",
|
||||
&["attempt"]
|
||||
)
|
||||
.unwrap();
|
||||
DropCalculation {
|
||||
first: vec.with_label_values(&["first"]),
|
||||
retry: vec.with_label_values(&["retry"]),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct Calculated {
|
||||
pub(crate) births: IntCounter,
|
||||
pub(crate) deaths: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
|
||||
births: register_int_counter!(
|
||||
"pageserver_initial_logical_size_finish_calculation",
|
||||
"Incremented every time we finish calculation of initial logical size.\
|
||||
If everything is working well, this should happen at most once per Timeline object."
|
||||
)
|
||||
.unwrap(),
|
||||
deaths: register_int_counter!(
|
||||
"pageserver_initial_logical_size_drop_finished_calculation",
|
||||
"Incremented when we drop a finished initial logical size calculation result.\
|
||||
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
|
||||
)
|
||||
.unwrap(),
|
||||
});
|
||||
|
||||
pub(crate) struct OngoingCalculationGuard {
|
||||
inc_drop_calculation: Option<IntCounter>,
|
||||
}
|
||||
|
||||
#[derive(strum_macros::IntoStaticStr)]
|
||||
pub(crate) enum StartCircumstances {
|
||||
EmptyInitial,
|
||||
SkippedConcurrencyLimiter,
|
||||
AfterBackgroundTasksRateLimit,
|
||||
}
|
||||
|
||||
impl StartCalculation {
|
||||
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0
|
||||
.with_label_values(&["first", circumstances_label])
|
||||
.inc();
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||
}
|
||||
}
|
||||
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0
|
||||
.with_label_values(&["retry", circumstances_label])
|
||||
.inc();
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for OngoingCalculationGuard {
|
||||
fn drop(&mut self) {
|
||||
if let Some(counter) = self.inc_drop_calculation.take() {
|
||||
counter.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OngoingCalculationGuard {
|
||||
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
|
||||
drop(self.inc_drop_calculation.take());
|
||||
CALCULATED.births.inc();
|
||||
FinishedCalculationGuard {
|
||||
inc_on_drop: CALCULATED.deaths.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct FinishedCalculationGuard {
|
||||
inc_on_drop: IntCounter,
|
||||
}
|
||||
|
||||
impl Drop for FinishedCalculationGuard {
|
||||
fn drop(&mut self) {
|
||||
self.inc_on_drop.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// context: https://github.com/neondatabase/neon/issues/5963
|
||||
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
|
||||
@@ -773,25 +589,6 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions_with_low_residence_duration",
|
||||
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
||||
Residence duration is determined using the `residence_duration_data_source`.",
|
||||
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
We log more context for each increment, so, forgo any labels in this metric.",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// How long did we take to start up? Broken down by labels to describe
|
||||
/// different phases of startup.
|
||||
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
@@ -869,134 +666,6 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
|
||||
}
|
||||
});
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
counter: Option<IntCounter>,
|
||||
}
|
||||
|
||||
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDurationBuilder {
|
||||
pub fn new(data_source: &'static str, threshold: Duration) -> Self {
|
||||
Self {
|
||||
data_source,
|
||||
threshold,
|
||||
}
|
||||
}
|
||||
|
||||
fn build(
|
||||
&self,
|
||||
tenant_id: &str,
|
||||
shard_id: &str,
|
||||
timeline_id: &str,
|
||||
) -> EvictionsWithLowResidenceDuration {
|
||||
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||
.get_metric_with_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
||||
])
|
||||
.unwrap();
|
||||
EvictionsWithLowResidenceDuration {
|
||||
data_source: self.data_source,
|
||||
threshold: self.threshold,
|
||||
counter: Some(counter),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDuration {
|
||||
fn threshold_label_value(threshold: Duration) -> String {
|
||||
format!("{}", threshold.as_secs())
|
||||
}
|
||||
|
||||
pub fn observe(&self, observed_value: Duration) {
|
||||
if observed_value < self.threshold {
|
||||
self.counter
|
||||
.as_ref()
|
||||
.expect("nobody calls this function after `remove_from_vec`")
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn change_threshold(
|
||||
&mut self,
|
||||
tenant_id: &str,
|
||||
shard_id: &str,
|
||||
timeline_id: &str,
|
||||
new_threshold: Duration,
|
||||
) {
|
||||
if new_threshold == self.threshold {
|
||||
return;
|
||||
}
|
||||
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
|
||||
self.data_source,
|
||||
new_threshold,
|
||||
)
|
||||
.build(tenant_id, shard_id, timeline_id);
|
||||
std::mem::swap(self, &mut with_new);
|
||||
with_new.remove(tenant_id, shard_id, timeline_id);
|
||||
}
|
||||
|
||||
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
||||
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
|
||||
let Some(_counter) = self.counter.take() else {
|
||||
return;
|
||||
};
|
||||
|
||||
let threshold = Self::threshold_label_value(self.threshold);
|
||||
|
||||
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&threshold,
|
||||
]);
|
||||
|
||||
match removed {
|
||||
Err(e) => {
|
||||
// this has been hit in staging as
|
||||
// <https://neondatabase.sentry.io/issues/4142396994/>, but we don't know how.
|
||||
// because we can be in the drop path already, don't risk:
|
||||
// - "double-panic => illegal instruction" or
|
||||
// - future "drop panick => abort"
|
||||
//
|
||||
// so just nag: (the error has the labels)
|
||||
tracing::warn!(
|
||||
"failed to remove EvictionsWithLowResidenceDuration, it was already removed? {e:#?}"
|
||||
);
|
||||
}
|
||||
Ok(()) => {
|
||||
// to help identify cases where we double-remove the same values, let's log all
|
||||
// deletions?
|
||||
tracing::info!(
|
||||
"removed EvictionsWithLowResidenceDuration with {tenant_id}, {timeline_id}, {}, {threshold}",
|
||||
self.data_source
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Metrics collected on disk IO operations
|
||||
//
|
||||
// Roughly logarithmic scale.
|
||||
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
0.000030, // 30 usec
|
||||
0.001000, // 1000 usec
|
||||
0.030, // 30 ms
|
||||
1.000, // 1000 ms
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
/// VirtualFile fs operation variants.
|
||||
///
|
||||
@@ -1024,52 +693,6 @@ pub(crate) enum StorageIoOperation {
|
||||
Metadata,
|
||||
}
|
||||
|
||||
impl StorageIoOperation {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
StorageIoOperation::Open => "open",
|
||||
StorageIoOperation::OpenAfterReplace => "open-after-replace",
|
||||
StorageIoOperation::Close => "close",
|
||||
StorageIoOperation::CloseByReplace => "close-by-replace",
|
||||
StorageIoOperation::Read => "read",
|
||||
StorageIoOperation::Write => "write",
|
||||
StorageIoOperation::Seek => "seek",
|
||||
StorageIoOperation::Fsync => "fsync",
|
||||
StorageIoOperation::Metadata => "metadata",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Tracks time taken by fs operations near VirtualFile.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct StorageIoTime {
|
||||
metrics: [Histogram; StorageIoOperation::COUNT],
|
||||
}
|
||||
|
||||
impl StorageIoTime {
|
||||
fn new() -> Self {
|
||||
let storage_io_histogram_vec = register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = StorageIoOperation::from_repr(i).unwrap();
|
||||
storage_io_histogram_vec
|
||||
.get_metric_with_label_values(&[op.as_str()])
|
||||
.unwrap()
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
|
||||
pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
|
||||
&self.metrics[op as usize]
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
#[repr(usize)]
|
||||
@@ -2190,7 +1813,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub find_gc_cutoffs_histo: StorageTimeMetrics,
|
||||
pub last_record_lsn_gauge: IntGauge,
|
||||
pub disk_consistent_lsn_gauge: IntGauge,
|
||||
pub pitr_history_size: UIntGauge,
|
||||
pub archival_size: UIntGauge,
|
||||
@@ -2203,7 +1825,6 @@ pub(crate) struct TimelineMetrics {
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
pub wal_records_received: IntCounter,
|
||||
@@ -2218,7 +1839,6 @@ impl TimelineMetrics {
|
||||
pub fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id_raw: &TimelineId,
|
||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||
@@ -2277,9 +1897,6 @@ impl TimelineMetrics {
|
||||
&shard_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let last_record_lsn_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
@@ -2332,8 +1949,6 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||
.build(&tenant_id, &shard_id, &timeline_id);
|
||||
|
||||
let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
@@ -2373,7 +1988,6 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
find_gc_cutoffs_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_lsn_gauge,
|
||||
disk_consistent_lsn_gauge,
|
||||
pitr_history_size,
|
||||
archival_size,
|
||||
@@ -2385,9 +1999,6 @@ impl TimelineMetrics {
|
||||
aux_file_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
evictions,
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
storage_io_size,
|
||||
valid_lsn_lease_count_gauge,
|
||||
wal_records_received,
|
||||
@@ -2482,11 +2093,6 @@ impl TimelineMetrics {
|
||||
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
.unwrap()
|
||||
.remove(tenant_id, shard_id, timeline_id);
|
||||
|
||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||
// removed at the end of it. The idea is to have the metrics outlive the
|
||||
// entity during which they're observed, e.g., the smgr metrics shall
|
||||
@@ -3244,7 +2850,6 @@ pub fn preinitialize_metrics(
|
||||
|
||||
// counters
|
||||
[
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
|
||||
@@ -77,7 +77,6 @@ use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::PageCacheSizeMetrics;
|
||||
use crate::virtual_file::{IoBufferMut, IoPageSlice};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
@@ -206,7 +205,6 @@ pub struct PageCache {
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
|
||||
size_metrics: &'static PageCacheSizeMetrics,
|
||||
}
|
||||
|
||||
struct PinnedSlotsPermit {
|
||||
@@ -500,7 +498,6 @@ impl PageCache {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -518,7 +515,6 @@ impl PageCache {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
self.size_metrics.current_bytes_immutable.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -608,10 +604,6 @@ impl PageCache {
|
||||
// this is avoided.
|
||||
let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
size_metrics.current_bytes_immutable.set_page_sz(0);
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
.map(|chunk| {
|
||||
@@ -633,31 +625,8 @@ impl PageCache {
|
||||
immutable_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
size_metrics,
|
||||
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trait PageSzBytesMetric {
|
||||
fn set_page_sz(&self, count: usize);
|
||||
fn add_page_sz(&self, count: usize);
|
||||
fn sub_page_sz(&self, count: usize);
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn count_times_page_sz(count: usize) -> u64 {
|
||||
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
|
||||
}
|
||||
|
||||
impl PageSzBytesMetric for metrics::UIntGauge {
|
||||
fn set_page_sz(&self, count: usize) {
|
||||
self.set(count_times_page_sz(count));
|
||||
}
|
||||
fn add_page_sz(&self, count: usize) {
|
||||
self.add(count_times_page_sz(count));
|
||||
}
|
||||
fn sub_page_sz(&self, count: usize) {
|
||||
self.sub(count_times_page_sz(count));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1121,7 +1121,6 @@ impl LayerInner {
|
||||
"unexpectedly on-demand downloading for task kind {:?}",
|
||||
ctx.task_kind()
|
||||
);
|
||||
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
|
||||
|
||||
let really_error =
|
||||
matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
|
||||
@@ -1570,17 +1569,7 @@ impl LayerInner {
|
||||
Ok(elapsed) => {
|
||||
let accessed_and_visible = self.access_stats.accessed()
|
||||
&& self.access_stats.visibility() == LayerVisibilityHint::Visible;
|
||||
if accessed_and_visible {
|
||||
// Only layers used for reads contribute to our "low residence" metric that is used
|
||||
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
|
||||
// to be rapidly evicted without contributing to this metric.
|
||||
timeline
|
||||
.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(elapsed);
|
||||
}
|
||||
|
||||
|
||||
tracing::info!(
|
||||
residence_millis = elapsed.as_millis(),
|
||||
|
||||
@@ -2718,15 +2718,6 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.eviction_policy)
|
||||
}
|
||||
|
||||
fn get_evictions_low_residence_duration_metric_threshold(
|
||||
tenant_conf: &pageserver_api::models::TenantConfig,
|
||||
default_tenant_conf: &pageserver_api::config::TenantConfigToml,
|
||||
) -> Duration {
|
||||
tenant_conf
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
|
||||
}
|
||||
|
||||
fn get_image_layer_creation_check_threshold(&self) -> u8 {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2802,28 +2793,8 @@ impl Timeline {
|
||||
|
||||
// The threshold is embedded in the metric. So, we need to update it.
|
||||
{
|
||||
let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&new_conf.tenant_conf,
|
||||
&self.conf.default_tenant_conf,
|
||||
);
|
||||
|
||||
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
|
||||
let timeline_id_str = self.timeline_id.to_string();
|
||||
|
||||
self.remote_client.update_config(&new_conf.location);
|
||||
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.write()
|
||||
.unwrap()
|
||||
.change_threshold(
|
||||
&tenant_id_str,
|
||||
&shard_id_str,
|
||||
&timeline_id_str,
|
||||
new_threshold,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2857,13 +2828,6 @@ impl Timeline {
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
|
||||
let evictions_low_residence_duration_metric_threshold = {
|
||||
let loaded_tenant_conf = tenant_conf.load();
|
||||
Self::get_evictions_low_residence_duration_metric_threshold(
|
||||
&loaded_tenant_conf.tenant_conf,
|
||||
&conf.default_tenant_conf,
|
||||
)
|
||||
};
|
||||
|
||||
if let Some(ancestor) = &ancestor {
|
||||
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
|
||||
@@ -2876,10 +2840,6 @@ impl Timeline {
|
||||
let metrics = Arc::new(TimelineMetrics::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
||||
"mtime",
|
||||
evictions_low_residence_duration_metric_threshold,
|
||||
),
|
||||
));
|
||||
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
|
||||
|
||||
@@ -3016,10 +2976,6 @@ impl Timeline {
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
|
||||
result
|
||||
.metrics
|
||||
.last_record_lsn_gauge
|
||||
.set(disk_consistent_lsn.0 as i64);
|
||||
result
|
||||
})
|
||||
}
|
||||
@@ -3489,7 +3445,7 @@ impl Timeline {
|
||||
self.current_logical_size.initialized.add_permits(1);
|
||||
}
|
||||
|
||||
let try_once = |attempt: usize| {
|
||||
let try_once = |_attempt: usize| {
|
||||
let background_ctx = &background_ctx;
|
||||
let self_ref = &self;
|
||||
let skip_concurrency_limiter = &skip_concurrency_limiter;
|
||||
@@ -3500,7 +3456,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
use crate::metrics::initial_logical_size::StartCircumstances;
|
||||
let (_maybe_permit, circumstances) = tokio::select! {
|
||||
let (_maybe_permit, _circumstances) = tokio::select! {
|
||||
permit = wait_for_permit => {
|
||||
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
|
||||
}
|
||||
@@ -3517,12 +3473,6 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
let metrics_guard = if attempt == 1 {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
|
||||
} else {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
|
||||
};
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self_ref.conf,
|
||||
self_ref
|
||||
@@ -3549,7 +3499,7 @@ impl Timeline {
|
||||
|
||||
// TODO: add aux file size to logical size
|
||||
|
||||
Ok((calculated_size, metrics_guard))
|
||||
Ok(calculated_size)
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3586,7 +3536,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
let (calculated_size, metrics_guard) = match retrying.await {
|
||||
let calculated_size = match retrying.await {
|
||||
ControlFlow::Continue(calculated_size) => calculated_size,
|
||||
ControlFlow::Break(()) => return,
|
||||
};
|
||||
@@ -3605,8 +3555,7 @@ impl Timeline {
|
||||
|
||||
self.current_logical_size
|
||||
.initial_logical_size
|
||||
.set((calculated_size, metrics_guard.calculation_result_saved()))
|
||||
.ok()
|
||||
.set((calculated_size,))
|
||||
.expect("only this task sets it");
|
||||
}
|
||||
|
||||
@@ -4503,8 +4452,6 @@ impl Timeline {
|
||||
|
||||
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||
assert!(new_lsn.is_aligned());
|
||||
|
||||
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ pub(super) struct LogicalSize {
|
||||
/// the initial size at a different LSN.
|
||||
pub initial_logical_size: OnceCell<(
|
||||
u64,
|
||||
crate::metrics::initial_logical_size::FinishedCalculationGuard,
|
||||
)>,
|
||||
|
||||
/// Cancellation for the best-effort logical size calculation.
|
||||
@@ -130,11 +129,7 @@ impl CurrentLogicalSize {
|
||||
impl LogicalSize {
|
||||
pub(super) fn empty_initial() -> Self {
|
||||
Self {
|
||||
initial_logical_size: OnceCell::with_value((0, {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION
|
||||
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
|
||||
.calculation_result_saved()
|
||||
})),
|
||||
initial_logical_size: OnceCell::with_value((0,)),
|
||||
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
|
||||
initial_part_end: None,
|
||||
size_added_after_initial: AtomicI64::new(0),
|
||||
@@ -159,7 +154,7 @@ impl LogicalSize {
|
||||
// ^^^ keep this type explicit so that the casts in this function break if
|
||||
// we change the type.
|
||||
match self.initial_logical_size.get() {
|
||||
Some((initial_size, _)) => {
|
||||
Some((initial_size, )) => {
|
||||
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
|
||||
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||
.unwrap()))
|
||||
@@ -181,7 +176,7 @@ impl LogicalSize {
|
||||
/// available for re-use. This doesn't contain the incremental part.
|
||||
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
||||
match self.initial_part_end {
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,12 +27,10 @@ use owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
|
||||
|
||||
use crate::assert_u64_eq_usize::UsizeIsU64;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
|
||||
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::{
|
||||
@@ -431,9 +429,7 @@ impl OpenFiles {
|
||||
if let Some(old_file) = slot_guard.file.take() {
|
||||
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
|
||||
// distinguish the two.
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::CloseByReplace)
|
||||
.observe_closure_duration(|| drop(old_file));
|
||||
drop(old_file);
|
||||
}
|
||||
|
||||
// Prepare the slot for reuse and return it
|
||||
@@ -532,13 +528,9 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
|
||||
/// where "support" means that we measure wall clock time.
|
||||
macro_rules! observe_duration {
|
||||
($op:expr, $($body:tt)*) => {{
|
||||
let instant = Instant::now();
|
||||
let result = $($body)*;
|
||||
let elapsed = instant.elapsed().as_secs_f64();
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get($op)
|
||||
.observe(elapsed);
|
||||
result
|
||||
|
||||
$($body)*
|
||||
|
||||
}}
|
||||
}
|
||||
|
||||
@@ -1263,9 +1255,7 @@ impl Drop for VirtualFileInner {
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
// comparison.
|
||||
if let Some(fd) = slot_guard.file.take() {
|
||||
STORAGE_IO_TIME_METRIC
|
||||
.get(StorageIoOperation::Close)
|
||||
.observe_closure_duration(|| drop(fd));
|
||||
drop(fd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user