From 42e19e952fedefd9b591f7dc951e7d934990508c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 26 Sep 2024 11:38:19 -0400 Subject: [PATCH] fix(pageserver): categorize client error in basebackup metrics (#9110) We separated client error from basebackup error log lines in https://github.com/neondatabase/neon/pull/7523, but we didn't do anything for the metrics. In this patch, we fixed it. ref https://github.com/neondatabase/neon/issues/8970 ## Summary of changes We use the same criteria as in `log_query_error` producing an info line (instead of error) for the metrics. We added a `client_error` category for the basebackup query time metrics. --------- Signed-off-by: Alex Chi Z --- libs/postgres_backend/src/lib.rs | 1 + pageserver/src/metrics.rs | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index e274d24585..085540e7b9 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String { } fn log_query_error(query: &str, e: &QueryError) { + // If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`. match e { QueryError::Disconnected(ConnectionError::Io(io_error)) => { if is_expected_io_error(io_error) { diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 366bd82903..b76efa5b48 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,6 +8,8 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; +use postgres_backend::{is_expected_io_error, QueryError}; +use pq_proto::framed::ConnectionError; use strum::{EnumCount, VariantNames}; use strum_macros::{IntoStaticStr, VariantNames}; use tracing::warn; @@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { pub(crate) struct BasebackupQueryTime { ok: Histogram, error: Histogram, + client_error: Histogram, } pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { @@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| BasebackupQueryTime { ok: vec.get_metric_with_label_values(&["ok"]).unwrap(), error: vec.get_metric_with_label_values(&["error"]).unwrap(), + client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(), } }); @@ -1557,7 +1561,7 @@ impl BasebackupQueryTime { } impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { - pub(crate) fn observe(self, res: &Result) { + pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self .ctx @@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { elapsed } }; - let metric = if res.is_ok() { - &self.parent.ok - } else { - &self.parent.error + // If you want to change categorize of a specific error, also change it in `log_query_error`. + let metric = match res { + Ok(_) => &self.parent.ok, + Err(QueryError::Disconnected(ConnectionError::Io(io_error))) + if is_expected_io_error(io_error) => + { + &self.parent.client_error + } + Err(_) => &self.parent.error, }; metric.observe(ex_throttled.as_secs_f64()); }