Compare commits

...

2 Commits

Author      SHA1        Message                                              Date
John Spray  a7e8e4b449  pageserver: shutdown timeline metrics during delete  2024-03-25 09:49:05 +00:00
John Spray  d5e2fc4dfc  pageserver: implement Drop for TimelineMetrics       2024-03-25 09:49:05 +00:00
3 changed files with 41 additions and 0 deletions

View File

@@ -1876,6 +1876,7 @@ impl StorageTimeMetrics {
 #[derive(Debug)]
 pub(crate) struct TimelineMetrics {
+    registered: AtomicBool,
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
@@ -1979,6 +1980,7 @@ impl TimelineMetrics {
             .build(&tenant_id, &shard_id, &timeline_id);
         TimelineMetrics {
+            registered: AtomicBool::new(true),
             tenant_id,
             shard_id,
             timeline_id,
@@ -2018,6 +2020,9 @@ impl TimelineMetrics {
         self.resident_physical_size_gauge.get()
     }
 
+    /// For use when callers would like to de-register statistics before dropping the object, for example
+    /// because the Timeline might outlive its logical lifetime due to Arc references, and we don't want
+    /// to risk two timelines with the same ID fighting for the metrics.
     pub(crate) fn shutdown(&self) {
         let tenant_id = &self.tenant_id;
         let timeline_id = &self.timeline_id;
@@ -2070,6 +2075,19 @@ impl TimelineMetrics {
                 timeline_id,
             ]);
         }
+
+        self.registered
+            .store(false, std::sync::atomic::Ordering::Relaxed);
     }
 }
+
+impl Drop for TimelineMetrics {
+    fn drop(&mut self) {
+        // We usually expect shutdown() to have been called explicitly, but to avoid a fragile dependency on
+        // well behaved callers, also de-register on drop.
+        if self.registered.load(std::sync::atomic::Ordering::Relaxed) {
+            self.shutdown();
+        }
+    }
+}
@@ -2087,6 +2105,7 @@ use futures::Future;
 use pin_project_lite::pin_project;
 use std::collections::HashMap;
 use std::pin::Pin;
+use std::sync::atomic::AtomicBool;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
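
Taken together, the hunks above implement a de-register-once pattern: an explicit shutdown() removes the per-timeline metrics and clears the registered flag, and Drop only de-registers if nobody called shutdown() first. A minimal, self-contained sketch of that pattern, with a hypothetical Metrics type and a println! standing in for the real prometheus label-removal calls:

    use std::sync::atomic::{AtomicBool, Ordering};

    // Hypothetical stand-in for TimelineMetrics: the real struct holds
    // prometheus gauges whose label values shutdown() removes.
    struct Metrics {
        registered: AtomicBool,
        timeline_id: String,
    }

    impl Metrics {
        fn new(timeline_id: &str) -> Self {
            Metrics {
                registered: AtomicBool::new(true),
                timeline_id: timeline_id.to_string(),
            }
        }

        // Mirror of the patch: de-register eagerly, then clear the flag
        // so a later Drop does not repeat the work.
        fn shutdown(&self) {
            println!("de-registering metrics for timeline {}", self.timeline_id);
            self.registered.store(false, Ordering::Relaxed);
        }
    }

    impl Drop for Metrics {
        fn drop(&mut self) {
            // Fallback for callers that never invoked shutdown() explicitly.
            if self.registered.load(Ordering::Relaxed) {
                self.shutdown();
            }
        }
    }

    fn main() {
        let m = Metrics::new("abc");
        m.shutdown(); // explicit path, as during timeline deletion
        drop(m);      // registered is already false, so Drop is a no-op
    }

Relaxed ordering suffices here because the flag only guards idempotency of de-registration; it is not used to synchronize any other memory.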

View File

@@ -26,6 +26,10 @@ use crate::{
 use super::{Timeline, TimelineResources};
 
 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+///
+/// This is essentially a hand-crafted subset of Timeline::shutdown, which exists because we
+/// rely on keeping Timeline partially alive in order to access its RemoteTimelineClient for remote
+/// deletion.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
     // Notify any timeline work to drop out of loops/requests
@@ -82,6 +86,8 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
         ))?
     });
 
+    timeline.metrics.shutdown();
+
     tracing::debug!("Waiting for gate...");
     timeline.gate.close().await;
     tracing::debug!("Shutdown complete");

View File

@@ -88,6 +88,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
     assert timeline_path.exists()
 
+    # Before deleting, timeline metrics should be present
+    assert (
+        ps_http.get_metric_value(
+            "pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
+        )
+        is not None
+    )
+
     # retry deletes when compaction or gc is running in pageserver
     # TODO: review whether this wait_until is actually necessary, we do an await() internally
     wait_until(
@@ -106,6 +114,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
     assert exc.value.status_code == 404
 
+    # Check metrics were cleaned up
+    assert (
+        ps_http.get_metric_value(
+            "pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
+        )
+        is None
+    )
+
     wait_until(
         number_of_iterations=3,
         interval=0.2,