pageserver: shut down timeline metrics during delete

This commit is contained in:
John Spray
2024-03-25 09:45:52 +00:00
parent d5e2fc4dfc
commit a7e8e4b449
2 changed files with 22 additions and 0 deletions

View File

@@ -26,6 +26,10 @@ use crate::{
use super::{Timeline, TimelineResources};
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
///
/// This is essentially a hand-crafted subset of Timeline::shutdown. It exists separately because
/// we rely on keeping the Timeline partially alive in order to access its RemoteTimelineClient
/// for remote deletion.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Notify any timeline work to drop out of loops/requests
@@ -82,6 +86,8 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
))?
});
timeline.metrics.shutdown();
tracing::debug!("Waiting for gate...");
timeline.gate.close().await;
tracing::debug!("Shutdown complete");

View File

@@ -88,6 +88,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
assert timeline_path.exists()
# Before deleting, timeline metrics should be present
assert (
ps_http.get_metric_value(
"pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
)
is not None
)
# retry deletes when compaction or gc is running in pageserver
# TODO: review whether this wait_until is actually necessary, since we do an await() internally
wait_until(
@@ -106,6 +114,14 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
assert exc.value.status_code == 404
# Check metrics were cleaned up
assert (
ps_http.get_metric_value(
"pageserver_current_logical_size", {"timeline_id": str(leaf_timeline_id)}
)
is None
)
wait_until(
number_of_iterations=3,
interval=0.2,