From 6861259be7ee63f6a4bb2a9fdb5546147bf20389 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 26 Apr 2023 15:18:26 +0200 Subject: [PATCH] add global metric for unexpected on-demand downloads (#4069) Until we have toned down the prod logs to zero WARN and ERROR, we want a dedicated metric for which we can have a dedicated alert. fixes https://github.com/neondatabase/neon/issues/3924 --- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/lib.rs | 2 ++ pageserver/src/metrics.rs | 16 ++++++++++++++++ pageserver/src/tenant/timeline.rs | 3 ++- test_runner/fixtures/metrics.py | 1 + 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ed23a18ee0..8e4897c09c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -226,6 +226,7 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION); set_launch_timestamp_metric(launch_ts); + pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 278658eba3..04863886cb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -44,6 +44,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); +pub use crate::metrics::preinitialize_metrics; + pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d6978a8cf6..deb20f21f8 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -205,6 +205,15 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| .expect("failed to define a metric") }); +pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| { + register_int_counter!( + "pageserver_unexpected_ondemand_downloads_count", + "Number of unexpected on-demand downloads. \ + We log more context for each increment, so, forgo any labels in this metric.", + ) + .expect("failed to define a metric") +}); + /// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] pub struct EvictionsWithLowResidenceDuration { @@ -1132,3 +1141,10 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> { poll_result } } + +pub fn preinitialize_metrics() { + // We want to alert on this metric increasing. + // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. 
+ assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); + UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b8b1f963e5..6c34f5a5b5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -48,7 +48,7 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; @@ -2355,6 +2355,7 @@ impl Timeline { id, ctx.task_kind() ); + UNEXPECTED_ONDEMAND_DOWNLOADS.inc(); timeline.download_remote_layer(remote_layer).await?; continue 'layer_map_search; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 5fed6fcf84..0e958ddd06 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -53,6 +53,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_global_count", "pageserver_storage_operations_seconds_global_sum", "pageserver_storage_operations_seconds_global_bucket", + "pageserver_unexpected_ondemand_downloads_count_total", "libmetrics_launch_timestamp", "libmetrics_build_info", "libmetrics_tracing_event_count_total",