From e2db76b9be5fc0de8f953dc4ba9f039ce05cdd95 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Apr 2025 12:04:00 -0400 Subject: [PATCH] feat(pageserver): ondemand download reason observability (#11780) ## Problem Part of https://github.com/neondatabase/neon/issues/11615 ## Summary of changes We don't yet understand the root cause of the resident size surges we see every now and then. This patch adds observability for that: per-task-kind counters for the bytes and the number of layers downloaded on demand. Over the next week, this should give us a better understanding of what's going on. --------- Signed-off-by: Alex Chi Z --- pageserver/src/metrics.rs | 18 ++++++++++++++++++ pageserver/src/tenant/storage_layer/layer.rs | 9 +++++++++ 2 files changed, 27 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index a68b6acca1..8e4dbd6c3e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); +pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_bytes_total", + "Total bytes of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_ondemand_download_count", + "Total count of layers on-demand downloaded", + &["task_kind"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod wait_ondemand_download_time { use super::*; const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[ diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b7f6e5dc77..50810cb154 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Weak}; use std::time::{Duration, SystemTime}; use 
crate::PERF_TRACE_TARGET; +use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT}; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; @@ -1255,6 +1256,14 @@ impl LayerInner { self.access_stats.record_residence_event(); + let task_kind: &'static str = ctx.task_kind().into(); + ONDEMAND_DOWNLOAD_BYTES + .with_label_values(&[task_kind]) + .inc_by(self.desc.file_size); + ONDEMAND_DOWNLOAD_COUNT + .with_label_values(&[task_kind]) + .inc(); + Ok(self.initialize_after_layer_is_on_disk(permit)) } Err(e) => {