From beefc7a8108e5af333bc1e453749acf872f18fdd Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 20 Aug 2024 19:47:42 +0100 Subject: [PATCH] pageserver: add metric pageserver_secondary_heatmap_total_size (#8768) ## Problem We don't have a convenient way for a human to ask "how far are secondary downloads along for this tenant". This is useful when driving migrations of tenants to the storage controller, as we first create a secondary location and want to see it warm up before we cut over. That can already be done via storcon_cli, but we would like a way that doesn't require direct API access. ## Summary of changes Add a metric that reports the total size of layers in the heatmap: this may be used in conjunction with the existing `pageserver_secondary_resident_physical_size` to estimate "warmth" of the secondary location. --- pageserver/src/metrics.rs | 9 +++++++++ pageserver/src/tenant/secondary.rs | 10 ++++++++++ pageserver/src/tenant/secondary/downloader.rs | 6 ++++++ 3 files changed, 25 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index cd2cd43f27..1bc9352256 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1803,6 +1803,15 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n .expect("failed to define a metric") }); +pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_heatmap_total_size", + "The total size in bytes of all layers in the most recently downloaded heatmap.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 3132a28b12..1331c07d05 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ 
context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, + metrics::SECONDARY_HEATMAP_TOTAL_SIZE, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, }; @@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant { // Sum of layer sizes on local disk pub(super) resident_size_metric: UIntGauge, + + // Sum of layer sizes in the most recently downloaded heatmap + pub(super) heatmap_total_size_metric: UIntGauge, } impl Drop for SecondaryTenant { @@ -112,6 +116,7 @@ impl Drop for SecondaryTenant { let tenant_id = self.tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); } } @@ -128,6 +133,10 @@ impl SecondaryTenant { .get_metric_with_label_values(&[&tenant_id, &shard_id]) .unwrap(); + let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -145,6 +154,7 @@ impl SecondaryTenant { progress: std::sync::Mutex::default(), resident_size_metric, + heatmap_total_size_metric, }) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8cff1d2864..90e1c01dbd 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> { layers_downloaded: 0, bytes_downloaded: 0, }; + + // Also expose heatmap bytes_total as a metric + self.secondary_state + .heatmap_total_size_metric + .set(heatmap_stats.bytes); + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock let mut delete_layers = Vec::new(); let mut delete_timelines = Vec::new();