From 77a68326c5bf957e905654c69642da3caca649aa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Jul 2023 11:15:54 +0300 Subject: [PATCH] Thin out TenantState metric, keep set of broken tenants (#4796) We currently have a timeseries for each of the tenants in different states. We only want this for Broken. Other states could be counters. Fix this by making the `pageserver_tenant_states_count` a counter without a `tenant_id` and add a `pageserver_broken_tenants_count` which has a `tenant_id` label, each broken tenant being 1. --- pageserver/src/metrics.rs | 22 ++++-- pageserver/src/tenant.rs | 56 +++++++++------ pageserver/src/tenant/timeline.rs | 4 +- test_runner/fixtures/metrics.py | 3 +- test_runner/regress/test_tenant_detach.py | 88 +++++++++++++++++++++++ test_runner/regress/test_tenants.py | 4 +- 6 files changed, 145 insertions(+), 32 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index bdd79711ec..c233f36da6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -6,7 +6,6 @@ use metrics::{ IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; -use pageserver_api::models::TenantState; use strum::VariantNames; use strum_macros::{EnumVariantNames, IntoStaticStr}; use utils::id::{TenantId, TimelineId}; @@ -306,11 +305,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); -pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { +pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", "Count of tenants per state", - &["tenant_id", "state"] + &["state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + +/// A set of broken tenants. +/// +/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken +/// tenant. +pub(crate) static BROKEN_TENANTS_SET: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_broken_tenants_count", + "Set of broken tenants", + &["tenant_id"] ) .expect("Failed to register pageserver_tenant_states_count metric") }); @@ -1023,9 +1035,7 @@ impl Drop for TimelineMetrics { pub fn remove_tenant_metrics(tenant_id: &TenantId) { let tid = tenant_id.to_string(); let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); - for state in TenantState::VARIANTS { - let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); - } + // we leave the BROKEN_TENANTS_SET entry if any } use futures::Future; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3bf88f2729..654f5f133f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2199,28 +2199,44 @@ impl Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { - let mut current_state: &'static str = From::from(&*rx.borrow_and_update()); let tid = tenant_id.to_string(); - TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state]) - .inc(); - loop { - match rx.changed().await { - Ok(()) => { - let new_state: &'static str = From::from(&*rx.borrow_and_update()); - TENANT_STATE_METRIC - .with_label_values(&[&tid, current_state]) - .dec(); - TENANT_STATE_METRIC - .with_label_values(&[&tid, new_state]) - .inc(); - current_state = new_state; - } - Err(_sender_dropped_error) => { - info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); - return; - } + fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { + ([state.into()], matches!(state, TenantState::Broken { .. })) + } + + let mut tuple = inspect_state(&rx.borrow_and_update()); + + let is_broken = tuple.1; + if !is_broken { + // the tenant might be ignored and reloaded, so first remove any previous set + // element. it most likely has already been scraped, as these are manual operations + // right now. most likely we will add it back very soon. + drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid])); + } + + loop { + let labels = &tuple.0; + let current = TENANT_STATE_METRIC.with_label_values(labels); + current.inc(); + + if rx.changed().await.is_err() { + // tenant has been dropped; decrement the counter because a tenant with that + // state is no longer in tenant map, but allow any broken set item to exist + // still. + current.dec(); + break; + } + + current.dec(); + tuple = inspect_state(&rx.borrow_and_update()); + + let is_broken = tuple.1; + if is_broken { + // insert the tenant_id (back) into the set + crate::metrics::BROKEN_TENANTS_SET + .with_label_values(&[&tid]) + .inc(); } } }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a7daf09021..c663a4f9ad 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -334,7 +334,7 @@ pub struct GcInfo { #[derive(thiserror::Error)] pub enum PageReconstructError { #[error(transparent)] - Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + Other(#[from] anyhow::Error), /// The operation would require downloading a layer that is missing locally. NeedsDownload(TenantTimelineId, LayerFileName), @@ -925,7 +925,7 @@ impl Timeline { new_state, TimelineState::Stopping | TimelineState::Broken { .. } ) { - // drop the copmletion guard, if any; it might be holding off the completion + // drop the completion guard, if any; it might be holding off the completion // forever needlessly self.initial_logical_size_attempt .lock() diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index ca468cc02c..52d32df43e 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -76,6 +76,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_remote_timeline_client_calls_started"), *histogram("pageserver_io_operations_seconds"), + "pageserver_tenant_states_count", ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( @@ -90,8 +91,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_storage_operations_seconds_sum_total", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", - "pageserver_tenant_states_count", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, + # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 6803f6dbb1..77f93dbd92 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -2,6 +2,7 @@ import asyncio import random import time from threading import Thread +from typing import List, Optional import asyncpg import pytest @@ -21,6 +22,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until +from prometheus_client.samples import Sample def do_gc_target( @@ -854,3 +856,89 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): assert ( query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data ), "Should have timeline data back" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_metrics_while_ignoring_broken_tenant_and_reloading( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_metrics_while_ignoring_broken_tenant_and_reloading", + ) + + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + + def only_int(samples: List[Sample]) -> Optional[int]: + if len(samples) == 1: + return int(samples[0].value) + assert len(samples) == 0 + return None + + wait_until_tenant_state(client, env.initial_tenant, "Active", 10, 0.5) + + client.tenant_break(env.initial_tenant) + + found_broken = False + active, broken, broken_set = ([], [], []) + for _ in range(10): + m = client.get_metrics() + active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) + broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) + broken_set = m.query_all( + "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} + ) + found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + + if found_broken: + break + log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") + time.sleep(0.5) + assert ( + found_broken + ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + + client.tenant_ignore(env.initial_tenant) + + found_broken = False + broken, broken_set = ([], []) + for _ in range(10): + m = client.get_metrics() + broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) + broken_set = m.query_all( + "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} + ) + found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + + if found_broken: + break + time.sleep(0.5) + assert ( + found_broken + ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + + client.tenant_load(env.initial_tenant) + + found_active = False + active, broken_set = ([], []) + for _ in range(10): + m = client.get_metrics() + active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) + broken_set = m.query_all( + "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} + ) + found_active = only_int(active) == 1 and len(broken_set) == 0 + + if found_active: + break + time.sleep(0.5) + + assert ( + found_active + ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 35f7941bda..e327910138 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -383,10 +383,8 @@ def test_pageserver_with_empty_tenants( ps_metrics = client.get_metrics() broken_tenants_metric_filter = { "tenant_id": str(tenant_without_timelines_dir), - "state": "Broken", } active_tenants_metric_filter = { - "tenant_id": str(tenant_with_empty_timelines), "state": "Active", } @@ -402,7 +400,7 @@ def test_pageserver_with_empty_tenants( tenant_broken_count = int( ps_metrics.query_one( - "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + "pageserver_broken_tenants_count", filter=broken_tenants_metric_filter ).value )