Compare commits

...

2 Commits

Author      SHA1        Message                                              Date
John Spray  a7e8e4b449  pageserver: shutdown timeline metrics during delete  2024-03-25 09:49:05 +00:00
John Spray  d5e2fc4dfc  pageserver: implement Drop for TimelineMetrics       2024-03-25 09:49:05 +00:00
3 changed files with 41 additions and 0 deletions

View File

@@ -1876,6 +1876,7 @@ impl StorageTimeMetrics {
 #[derive(Debug)]
 pub(crate) struct TimelineMetrics {
+    registered: AtomicBool,
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
@@ -1979,6 +1980,7 @@ impl TimelineMetrics {
             .build(&tenant_id, &shard_id, &timeline_id);
         TimelineMetrics {
+            registered: AtomicBool::new(true),
             tenant_id,
             shard_id,
             timeline_id,
@@ -2018,6 +2020,9 @@ impl TimelineMetrics {
         self.resident_physical_size_gauge.get()
     }
 
+    /// For use when callers would like to de-register statistics before dropping the object, for example
+    /// because the Timeline might outlive its logical lifetime due to Arc references, and we don't want
+    /// to risk two timelines with the same ID fighting for the metrics.
     pub(crate) fn shutdown(&self) {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
@@ -2070,6 +2075,19 @@ impl TimelineMetrics {
                 timeline_id,
             ]);
         }
+
+        self.registered
+            .store(false, std::sync::atomic::Ordering::Relaxed);
     }
 }
+
+impl Drop for TimelineMetrics {
+    fn drop(&mut self) {
+        // We usually expect shutdown() to have been called explicitly, but to avoid a fragile dependency on
+        // well behaved callers, also de-register on drop.
+        if self.registered.load(std::sync::atomic::Ordering::Relaxed) {
+            self.shutdown();
+        }
+    }
+}
@@ -2087,6 +2105,7 @@ use futures::Future;
 use pin_project_lite::pin_project;
 use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::atomic::AtomicBool;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
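
Taken together, the hunks above implement a de-register-once pattern: an explicit shutdown() removes the per-timeline metrics and clears the registered flag, and Drop only de-registers if nobody called shutdown() first. A minimal, self-contained sketch of that pattern, with a hypothetical Metrics type and a println! standing in for the real prometheus label-removal calls:

    use std::sync::atomic::{AtomicBool, Ordering};

    // Hypothetical stand-in for TimelineMetrics: the real struct holds
    // prometheus gauges whose label values shutdown() removes.
    struct Metrics {
        registered: AtomicBool,
        timeline_id: String,
    }

    impl Metrics {
        fn new(timeline_id: &str) -> Self {
            Metrics {
                registered: AtomicBool::new(true),
                timeline_id: timeline_id.to_string(),
            }
        }

        // Mirror of the patch: de-register eagerly, then clear the flag
        // so a later Drop does not repeat the work.
        fn shutdown(&self) {
            println!("de-registering metrics for timeline {}", self.timeline_id);
            self.registered.store(false, Ordering::Relaxed);
        }
    }

    impl Drop for Metrics {
        fn drop(&mut self) {
            // Fallback for callers that never invoked shutdown() explicitly.
            if self.registered.load(Ordering::Relaxed) {
                self.shutdown();
            }
        }
    }

    fn main() {
        let m = Metrics::new("abc");
        m.shutdown(); // explicit path, as during timeline deletion
        drop(m);      // registered is already false, so Drop is a no-op
    }

Relaxed ordering suffices here because the flag only guards idempotency of de-registration; it is not used to synchronize any other memory.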

View File

@@ -26,6 +26,10 @@ use crate::{
 use super::{Timeline, TimelineResources};
 
 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+///
+/// This is essentially a hand-crafted subset of Timeline::shutdown, which exists because we
+/// rely on keeping Timeline partially alive in order to access its RemoteTimelineClient for remote
+/// deletion.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
     // Notify any timeline work to drop out of loops/requests
@@ -82,6 +86,8 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
         ))?
     });
 
+    timeline.metrics.shutdown();
+
     tracing::debug!("Waiting for gate...");
     timeline.gate.close().await;
     tracing::debug!("Shutdown complete");

View File

@@ -88,6 +88,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
     assert timeline_path.exists()
 
+    # Before deleting, timeline metrics should be present
+    assert (
+        ps_http.get_metric_value(
+            "pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
+        )
+        is not None
+    )
+
     # retry deletes when compaction or gc is running in pageserver
     # TODO: review whether this wait_until is actually necessary, we do an await() internally
     wait_until(
@@ -106,6 +114,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
     assert exc.value.status_code == 404
 
+    # Check metrics were cleaned up
+    assert (
+        ps_http.get_metric_value(
+            "pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
+        )
+        is None
+    )
+
     wait_until(
         number_of_iterations=3,
         interval=0.2,