mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
add metrics to detect eviction-induced thrashing (#3837)
This patch adds two metrics that will enable us to detect *thrashing* of layers, i.e., repetitions of `eviction, on-demand-download, eviction, ... ` for a given layer. The first metric counts all layer evictions per timeline. It requires no further explanation. The second metric counts the layer evictions where the layer was resident for less than a given threshold. We can alert on increments to the second metric. The first metric will serve as a baseline, and further, it's generally interesting, outside of thrashing. The second metric's threshold is configurable in PageServerConf and defaults to 24h. The threshold value is reproduced as a label in the metric because the counter's value is semantically tied to that threshold. Since changes to the config and hence the label value are infrequent, this will have low storage overhead in the metrics storage. The data source to determine the time that the layer was resident is the file's `mtime`. Using `mtime` is more of a crutch. It would be better if Pageserver did its own persistent bookkeeping of residence change events instead of relying on the filesystem. We had some discussion about this: https://github.com/neondatabase/neon/pull/3809#issuecomment-1470448900 My position is that `mtime` is good enough for now. It can theoretically jump forward if someone copies files without resetting `mtime`. But that shouldn't happen in practice. Note that moving files back and forth doesn't change `mtime`, nor does `chown` or `chmod`. Lastly, `rsync -a`, which is typically used for filesystem-level backup / restore, correctly syncs `mtime`. I've added a label that identifies the data source to keep options open for a future, better data source than `mtime`. Since this value will stay the same for the time being, it's not a problem for metrics storage. refs https://github.com/neondatabase/neon/issues/3728
This commit is contained in:
committed by
GitHub
parent
fea4b5f551
commit
881356c417
@@ -61,6 +61,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
@@ -89,6 +90,8 @@ pub mod defaults {
|
||||
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -170,6 +173,9 @@ pub struct PageServerConf {
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
// See the corresponding metric's help string.
|
||||
pub evictions_low_residence_duration_metric_threshold: Duration,
|
||||
|
||||
pub test_remote_failures: u64,
|
||||
|
||||
pub ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
@@ -240,6 +246,8 @@ struct PageServerConfigBuilder {
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
|
||||
evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
|
||||
|
||||
test_remote_failures: BuilderValue<u64>,
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
|
||||
@@ -293,6 +301,11 @@ impl Default for PageServerConfigBuilder {
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration(
|
||||
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
|
||||
ondemand_download_behavior_treat_error_as_warn: Set(false),
|
||||
@@ -408,6 +421,10 @@ impl PageServerConfigBuilder {
|
||||
self.test_remote_failures = BuilderValue::Set(fail_first);
|
||||
}
|
||||
|
||||
pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) {
|
||||
self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn ondemand_download_behavior_treat_error_as_warn(
|
||||
&mut self,
|
||||
ondemand_download_behavior_treat_error_as_warn: bool,
|
||||
@@ -481,6 +498,11 @@ impl PageServerConfigBuilder {
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
evictions_low_residence_duration_metric_threshold: self
|
||||
.evictions_low_residence_duration_metric_threshold
|
||||
.ok_or(anyhow!(
|
||||
"missing evictions_low_residence_duration_metric_threshold"
|
||||
))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
@@ -670,6 +692,7 @@ impl PageServerConf {
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
"evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
|
||||
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
@@ -810,6 +833,10 @@ impl PageServerConf {
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
|
||||
)
|
||||
.unwrap(),
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
}
|
||||
@@ -951,6 +978,9 @@ metric_collection_interval = '222 s'
|
||||
cached_metric_collection_interval = '22200 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
|
||||
evictions_low_residence_duration_metric_threshold = '444 s'
|
||||
|
||||
log_format = 'json'
|
||||
|
||||
"#;
|
||||
@@ -1005,6 +1035,9 @@ log_format = 'json'
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||
)?,
|
||||
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
|
||||
defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
|
||||
)?,
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
},
|
||||
@@ -1056,6 +1089,7 @@ log_format = 'json'
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
|
||||
test_remote_failures: 0,
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
},
|
||||
|
||||
@@ -194,6 +194,93 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions",
|
||||
"Number of layers evicted from the pageserver",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_evictions_with_low_residence_duration",
|
||||
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
||||
Residence duration is determined using the `residence_duration_data_source`.",
|
||||
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
counter: Option<IntCounter>,
|
||||
}
|
||||
|
||||
pub struct EvictionsWithLowResidenceDurationBuilder {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDurationBuilder {
|
||||
pub fn new(data_source: &'static str, threshold: Duration) -> Self {
|
||||
Self {
|
||||
data_source,
|
||||
threshold,
|
||||
}
|
||||
}
|
||||
|
||||
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
|
||||
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||
.get_metric_with_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
||||
])
|
||||
.unwrap();
|
||||
EvictionsWithLowResidenceDuration {
|
||||
data_source: self.data_source,
|
||||
threshold: self.threshold,
|
||||
counter: Some(counter),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EvictionsWithLowResidenceDuration {
|
||||
fn threshold_label_value(threshold: Duration) -> String {
|
||||
format!("{}", threshold.as_secs())
|
||||
}
|
||||
|
||||
pub fn observe(&self, observed_value: Duration) {
|
||||
if self.threshold < observed_value {
|
||||
self.counter
|
||||
.as_ref()
|
||||
.expect("nobody calls this function after `remove_from_vec`")
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
||||
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
|
||||
let Some(_counter) = self.counter.take() else {
|
||||
return;
|
||||
};
|
||||
EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||
.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
self.data_source,
|
||||
&Self::threshold_label_value(self.threshold),
|
||||
])
|
||||
.expect("we own the metric, no-one else should remove it");
|
||||
}
|
||||
}
|
||||
|
||||
// Metrics collected on disk IO operations
|
||||
//
|
||||
// Roughly logarithmic scale.
|
||||
@@ -510,10 +597,16 @@ pub struct TimelineMetrics {
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
pub persistent_bytes_written: IntCounter,
|
||||
pub evictions: IntCounter,
|
||||
pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration,
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
pub fn new(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
@@ -550,6 +643,11 @@ impl TimelineMetrics {
|
||||
let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
@@ -569,6 +667,8 @@ impl TimelineMetrics {
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
evictions,
|
||||
evictions_with_low_residence_duration,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -585,7 +685,9 @@ impl Drop for TimelineMetrics {
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
self.evictions_with_low_residence_duration
|
||||
.remove(tenant_id, timeline_id);
|
||||
for op in STORAGE_TIME_OPERATIONS {
|
||||
let _ =
|
||||
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -620,7 +722,7 @@ use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Instant;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
|
||||
@@ -1051,6 +1051,22 @@ impl Timeline {
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
|
||||
let local_layer_mtime = local_layer
|
||||
.local_path()
|
||||
.expect("local layer should have a local path")
|
||||
.metadata()
|
||||
.context("get local layer file stat")?
|
||||
.modified()
|
||||
.context("get mtime of layer file")?;
|
||||
let local_layer_residence_duration =
|
||||
match SystemTime::now().duration_since(local_layer_mtime) {
|
||||
Err(e) => {
|
||||
warn!("layer mtime is in the future: {}", e);
|
||||
None
|
||||
}
|
||||
Ok(delta) => Some(delta),
|
||||
};
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(layer_file_size);
|
||||
|
||||
let new_remote_layer = Arc::new(match local_layer.filename() {
|
||||
@@ -1093,6 +1109,14 @@ impl Timeline {
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
|
||||
self.metrics.evictions.inc();
|
||||
|
||||
if let Some(delta) = local_layer_residence_duration {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.observe(delta);
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
Replacement::NotFound => {
|
||||
@@ -1208,7 +1232,14 @@ impl Timeline {
|
||||
ancestor_timeline: ancestor,
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
|
||||
metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
|
||||
metrics: TimelineMetrics::new(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
||||
"mtime",
|
||||
conf.evictions_low_residence_duration_metric_threshold,
|
||||
),
|
||||
),
|
||||
|
||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||
|
||||
|
||||
@@ -78,5 +78,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_tenant_states_count",
|
||||
"pageserver_evictions_total",
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user