diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 6011713c8f..f24488b19d 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
+pub mod metric_vec_duration;
 
 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs
new file mode 100644
index 0000000000..840f60f19b
--- /dev/null
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -0,0 +1,23 @@
+//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec.
+
+use std::{future::Future, time::Instant};
+
+pub trait DurationResultObserver {
+    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
+}
+
+pub async fn observe_async_block_duration_by_result<
+    T,
+    E,
+    F: Future<Output = Result<T, E>>,
+    O: DurationResultObserver,
+>(
+    observer: &O,
+    block: F,
+) -> Result<T, E> {
+    let start = Instant::now();
+    let result = block.await;
+    let duration = start.elapsed();
+    observer.observe_result(&result, duration);
+    result
+}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 43d06db6d8..b7fdc65a00 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,3 +1,4 @@
+use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
     register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
     register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -424,6 +425,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+    BasebackupQueryTime({
+        register_histogram_vec!(
+            "pageserver_basebackup_query_seconds",
+            "Histogram of basebackup queries durations, by result type",
+            &["result"],
+            CRITICAL_OP_BUCKETS.into(),
+        )
+        .expect("failed to define a metric")
+    })
+});
+
+impl DurationResultObserver for BasebackupQueryTime {
+    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
+        let label_value = if res.is_ok() { "ok" } else { "error" };
+        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
+        metric.observe(duration.as_secs_f64());
+    }
+}
+
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_live_connections",
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 31ad45790c..d32518b513 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -913,10 +913,24 @@ where
                 None
             };
 
-            // Check that the timeline exists
-            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
-                .await?;
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            metrics::metric_vec_duration::observe_async_block_duration_by_result(
+                &*crate::metrics::BASEBACKUP_QUERY_TIME,
+                async move {
+                    self.handle_basebackup_request(
+                        pgb,
+                        tenant_id,
+                        timeline_id,
+                        lsn,
+                        None,
+                        false,
+                        ctx,
+                    )
+                    .await?;
+                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                    anyhow::Ok(())
+                },
+            )
+            .await?;
         }
         // return pair of prev_lsn and last_lsn
         else if query_string.starts_with("get_last_record_rlsn ") {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index d55d159037..7ee3c33f92 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -62,6 +62,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_getpage_reconstruct_seconds_bucket",
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",
+    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (