diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dd4f9107f9..b801347c06 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,9 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT; -use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; +use crate::metrics::{ + remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, +}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -2637,9 +2639,16 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - // Strings for metric labels + // reflect tenant state in metrics: + // - global per tenant state: TENANT_STATE_METRIC + // - "set" of broken tenants: BROKEN_TENANTS_SET + // + // set of broken tenants should not have zero counts so that it remains accessible for + // alerting. + let tid = tenant_shard_id.to_string(); - let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2648,21 +2657,13 @@ impl Tenant { let mut tuple = inspect_state(&rx.borrow_and_update()); let is_broken = tuple.1; - let mut counted_broken = if !is_broken { - // the tenant might be ignored and reloaded, so first remove any previous set - // element. it most likely has already been scraped, as these are manual operations - // right now. most likely we will add it back very soon. - drop( - crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]), - ); - false - } else { + let mut counted_broken = if is_broken { // add the id to the set right away, there should not be any updates on the channel - // after - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .set(1); + // after before tenant is removed, if ever + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); true + } else { + false }; loop { @@ -2671,10 +2672,9 @@ impl Tenant { current.inc(); if rx.changed().await.is_err() { - // tenant has been dropped; decrement the counter because a tenant with that - // state is no longer in tenant map, but allow any broken set item to exist - // still. + // tenant has been dropped current.dec(); + drop(BROKEN_TENANTS_SET.remove_label_values(set_key)); break; } @@ -2684,10 +2684,9 @@ impl Tenant { let is_broken = tuple.1; if is_broken && !counted_broken { counted_broken = true; - // insert the tenant_id (back) into the set - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid, &shard_id_str]) - .inc(); + // insert the tenant_id (back) into the set while avoiding needless counter + // access + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); } } }); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7c489bda67..ef41774289 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, - # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload + # "pageserver_broken_tenants_count" -- used only for broken ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d5ef4e3c4..4752699abb 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: NeonEnvBuilder, ): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( client.tenant_break(env.initial_tenant) - found_broken = False - active, broken, broken_set = ([], [], []) - for _ in range(10): + def found_broken(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - if found_broken: - break - log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") - time.sleep(0.5) - assert ( - found_broken - ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_broken) client.tenant_ignore(env.initial_tenant) - found_broken = False - broken, broken_set = ([], []) - for _ in range(10): + def found_cleaned_up(): m = client.get_metrics() broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + assert only_int(broken) == 0 and len(broken_set) == 0 - if found_broken: - break - time.sleep(0.5) - assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_cleaned_up) env.pageserver.tenant_load(env.initial_tenant) - found_active = False - active, broken_set = ([], []) - for _ in range(10): + def found_active(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_active = only_int(active) == 1 and len(broken_set) == 0 + assert only_int(active) == 1 and len(broken_set) == 0 - if found_active: - break - time.sleep(0.5) - - assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + wait_until(10, 0.5, found_active)