From 136cf1979baf96ff345fa3ff75f619c4d22ccd8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 21 May 2025 13:28:22 +0200
Subject: [PATCH] Add metric for number of offloaded timelines (#11976)

We want to keep track of the number of offloaded timelines. It's a
per-tenant shard metric because each shard makes offloading decisions on
its own.
---
 pageserver/src/metrics.rs                    | 14 +++++++++++++-
 pageserver/src/tenant.rs                     | 19 +++++++++++++++++--
 test_runner/fixtures/metrics.py              |  1 +
 test_runner/regress/test_timeline_archive.py | 10 ++++++++++
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c50f730f41..eae3045a3b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1066,6 +1066,15 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
     .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });
 
+pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tenant_offloaded_timelines",
+        "Number of offloaded timelines of a tenant",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("Failed to register pageserver_tenant_offloaded_timelines metric")
+});
+
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_eviction_iteration_duration_seconds_global",
@@ -3551,11 +3560,14 @@ impl TimelineMetrics {
 }
 
 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+    let tid = tenant_shard_id.tenant_id.to_string();
+    let shard_id = tenant_shard_id.shard_slug().to_string();
+
     // Only shard zero deals in synthetic sizes
     if tenant_shard_id.is_shard_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
         let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
     }
+    let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);
 
     tenant_throttling::remove_tenant_metrics(tenant_shard_id);
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fffd1f4090..35ddba355d 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -86,8 +86,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
     BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
-    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
-    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -3348,6 +3348,13 @@ impl TenantShard {
                 activated_timelines += 1;
             }
 
+            let tid = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id = self.tenant_shard_id.shard_slug().to_string();
+            let offloaded_timeline_count = timelines_offloaded_accessor.len();
+            TENANT_OFFLOADED_TIMELINES
+                .with_label_values(&[&tid, &shard_id])
+                .set(offloaded_timeline_count as u64);
+
             self.state.send_modify(move |current_state| {
                 assert!(
                     matches!(current_state, TenantState::Activating(_)),
@@ -5560,6 +5567,14 @@ impl TenantShard {
             }
         }
 
+        // Update metrics; use the same (tenant_id, shard_id) labels that remove_tenant_metrics() removes.
+        let tid = self.tenant_shard_id.tenant_id.to_string();
+        let shard_id = self.tenant_shard_id.shard_slug().to_string();
+        let set_key = &[tid.as_str(), shard_id.as_str()][..];
+        TENANT_OFFLOADED_TIMELINES
+            .with_label_values(set_key)
+            .set(manifest.offloaded_timelines.len() as u64);
+
         // Upload the manifest. Remote storage does no retries internally, so retry here.
         match backoff::retry(
             || async {
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 879808b7ba..1dd4fe8316 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -184,6 +184,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     "pageserver_evictions_with_low_residence_duration_total",
     "pageserver_aux_file_estimated_size",
     "pageserver_valid_lsn_lease_count",
+    "pageserver_tenant_offloaded_timelines",
     counter("pageserver_tenant_throttling_count_accounted_start"),
     counter("pageserver_tenant_throttling_count_accounted_finish"),
     counter("pageserver_tenant_throttling_wait_usecs_sum"),
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 4360b42d68..8d46ef8306 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -193,6 +193,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
         "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
     )
 
+    offloaded_count = ps_http.get_metric_value(
+        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
+    )
+    assert offloaded_count == 0
+
     ps_http.timeline_archival_config(
         tenant_id,
         leaf_timeline_id,
@@ -244,6 +249,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
     wait_until(leaf_offloaded)
     wait_until(parent_offloaded)
 
+    offloaded_count = ps_http.get_metric_value(
+        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
+    )
+    assert offloaded_count == 2
+
     # Offloaded child timelines should still prevent deletion
     with pytest.raises(
         PageserverApiException,