From 8224580f3e0517a9d5792d2ddae275c0e26377d6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 11 Mar 2024 15:41:41 +0100 Subject: [PATCH] fix(tenant/timeline metrics): race condition during shutdown + recreation (#7064) Tenant::shutdown or Timeline::shutdown completes and becomes externally observable before the corresponding Tenant/Timeline object is dropped. For example, after observing a Tenant::shutdown to complete, we could attach the same tenant_id again. The shut down Tenant object might still be around at the time of the attach. The race is then the following: - old object's metrics are still around - new object uses with_label_values - old object calls remove_label_values The outcome is that the new object will have the metric objects (they're an Arc internall) but the metrics won't be part of the internal registry and hence they'll be missing in `/metrics`. Later, when the new object gets shut down and tries to remove_label_value, it will observe an error because the metric was already removed by the old object. Changes ------- This PR moves metric removal to `shutdown()`. An alternative design would be to multi-version the metrics using a distinguishing label, or, to use a better metrics crate that allows removing metrics from the registry through the locally held metric handle instead of interacting with the (globally shared) registry. refs https://github.com/neondatabase/neon/pull/7051 --- pageserver/src/metrics.rs | 4 +--- pageserver/src/tenant.rs | 7 ++----- pageserver/src/tenant/timeline.rs | 2 ++ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 74e91210fc..814b3e1f96 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2017,10 +2017,8 @@ impl TimelineMetrics { pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } -} -impl Drop for TimelineMetrics { - fn drop(&mut self) { + pub(crate) fn shutdown(&self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4f4654422b..961995b2d6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1846,6 +1846,8 @@ impl Tenant { // Wait for any in-flight operations to complete self.gate.close().await; + remove_tenant_metrics(&self.tenant_shard_id); + Ok(()) } @@ -3557,11 +3559,6 @@ async fn run_initdb( Ok(()) } -impl Drop for Tenant { - fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_shard_id); - } -} /// Dump contents of a layer file to stdout. pub async fn dump_layerfile_from_path( path: &Utf8Path, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7004db1cb5..c017d30f45 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1257,6 +1257,8 @@ impl Timeline { // Finally wait until any gate-holders are complete self.gate.close().await; + + self.metrics.shutdown(); } pub(crate) fn set_state(&self, new_state: TimelineState) {