mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 21:40:39 +00:00
Thin out TenantState metric, keep set of broken tenants (#4796)
We currently have a timeseries for each of the tenants in different states. We only want this for Broken. Other states could be counters. Fix this by making the `pageserver_tenant_states_count` a counter without a `tenant_id` and add a `pageserver_broken_tenants_count` which has a `tenant_id` label, each broken tenant being 1.
This commit is contained in:
@@ -6,7 +6,6 @@ use metrics::{
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
use strum::VariantNames;
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -306,11 +305,24 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["tenant_id", "state"]
|
||||
&["state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
/// tenant.
|
||||
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
@@ -1023,9 +1035,7 @@ impl Drop for TimelineMetrics {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
for state in TenantState::VARIANTS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
|
||||
@@ -2199,28 +2199,44 @@ impl Tenant {
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut current_state: &'static str = From::from(&*rx.borrow_and_update());
|
||||
let tid = tenant_id.to_string();
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, current_state])
|
||||
.inc();
|
||||
loop {
|
||||
match rx.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state: &'static str = From::from(&*rx.borrow_and_update());
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, current_state])
|
||||
.dec();
|
||||
TENANT_STATE_METRIC
|
||||
.with_label_values(&[&tid, new_state])
|
||||
.inc();
|
||||
|
||||
current_state = new_state;
|
||||
}
|
||||
Err(_sender_dropped_error) => {
|
||||
info!("Tenant dropped the state updates sender, quitting waiting for tenant state change");
|
||||
return;
|
||||
}
|
||||
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
|
||||
([state.into()], matches!(state, TenantState::Broken { .. }))
|
||||
}
|
||||
|
||||
let mut tuple = inspect_state(&rx.borrow_and_update());
|
||||
|
||||
let is_broken = tuple.1;
|
||||
if !is_broken {
|
||||
// the tenant might be ignored and reloaded, so first remove any previous set
|
||||
// element. it most likely has already been scraped, as these are manual operations
|
||||
// right now. most likely we will add it back very soon.
|
||||
drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
|
||||
}
|
||||
|
||||
loop {
|
||||
let labels = &tuple.0;
|
||||
let current = TENANT_STATE_METRIC.with_label_values(labels);
|
||||
current.inc();
|
||||
|
||||
if rx.changed().await.is_err() {
|
||||
// tenant has been dropped; decrement the counter because a tenant with that
|
||||
// state is no longer in tenant map, but allow any broken set item to exist
|
||||
// still.
|
||||
current.dec();
|
||||
break;
|
||||
}
|
||||
|
||||
current.dec();
|
||||
tuple = inspect_state(&rx.borrow_and_update());
|
||||
|
||||
let is_broken = tuple.1;
|
||||
if is_broken {
|
||||
// insert the tenant_id (back) into the set
|
||||
crate::metrics::BROKEN_TENANTS_SET
|
||||
.with_label_values(&[&tid])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -334,7 +334,7 @@ pub struct GcInfo {
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum PageReconstructError {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
|
||||
Other(#[from] anyhow::Error),
|
||||
|
||||
/// The operation would require downloading a layer that is missing locally.
|
||||
NeedsDownload(TenantTimelineId, LayerFileName),
|
||||
@@ -925,7 +925,7 @@ impl Timeline {
|
||||
new_state,
|
||||
TimelineState::Stopping | TimelineState::Broken { .. }
|
||||
) {
|
||||
// drop the copmletion guard, if any; it might be holding off the completion
|
||||
// drop the completion guard, if any; it might be holding off the completion
|
||||
// forever needlessly
|
||||
self.initial_logical_size_attempt
|
||||
.lock()
|
||||
|
||||
@@ -76,6 +76,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
*histogram("pageserver_remote_operation_seconds"),
|
||||
*histogram("pageserver_remote_timeline_client_calls_started"),
|
||||
*histogram("pageserver_io_operations_seconds"),
|
||||
"pageserver_tenant_states_count",
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
@@ -90,8 +91,8 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_storage_operations_seconds_sum_total",
|
||||
"pageserver_created_persistent_files_total",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"pageserver_tenant_states_count",
|
||||
"pageserver_evictions_total",
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
|
||||
)
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import random
|
||||
import time
|
||||
from threading import Thread
|
||||
from typing import List, Optional
|
||||
|
||||
import asyncpg
|
||||
import pytest
|
||||
@@ -21,6 +22,7 @@ from fixtures.pageserver.utils import (
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
from prometheus_client.samples import Sample
|
||||
|
||||
|
||||
def do_gc_target(
|
||||
@@ -854,3 +856,89 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
|
||||
assert (
|
||||
query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
|
||||
), "Should have timeline data back"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_metrics_while_ignoring_broken_tenant_and_reloading(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
env.pageserver.allowed_errors.append(
|
||||
r".* Changing Active tenant to Broken state, reason: broken from test"
|
||||
)
|
||||
|
||||
def only_int(samples: List[Sample]) -> Optional[int]:
|
||||
if len(samples) == 1:
|
||||
return int(samples[0].value)
|
||||
assert len(samples) == 0
|
||||
return None
|
||||
|
||||
wait_until_tenant_state(client, env.initial_tenant, "Active", 10, 0.5)
|
||||
|
||||
client.tenant_break(env.initial_tenant)
|
||||
|
||||
found_broken = False
|
||||
active, broken, broken_set = ([], [], [])
|
||||
for _ in range(10):
|
||||
m = client.get_metrics()
|
||||
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
|
||||
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
|
||||
broken_set = m.query_all(
|
||||
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
|
||||
)
|
||||
found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
|
||||
|
||||
if found_broken:
|
||||
break
|
||||
log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
|
||||
time.sleep(0.5)
|
||||
assert (
|
||||
found_broken
|
||||
), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
|
||||
|
||||
client.tenant_ignore(env.initial_tenant)
|
||||
|
||||
found_broken = False
|
||||
broken, broken_set = ([], [])
|
||||
for _ in range(10):
|
||||
m = client.get_metrics()
|
||||
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
|
||||
broken_set = m.query_all(
|
||||
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
|
||||
)
|
||||
found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
|
||||
|
||||
if found_broken:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert (
|
||||
found_broken
|
||||
), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
|
||||
|
||||
client.tenant_load(env.initial_tenant)
|
||||
|
||||
found_active = False
|
||||
active, broken_set = ([], [])
|
||||
for _ in range(10):
|
||||
m = client.get_metrics()
|
||||
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
|
||||
broken_set = m.query_all(
|
||||
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
|
||||
)
|
||||
found_active = only_int(active) == 1 and len(broken_set) == 0
|
||||
|
||||
if found_active:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
assert (
|
||||
found_active
|
||||
), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
|
||||
|
||||
@@ -383,10 +383,8 @@ def test_pageserver_with_empty_tenants(
|
||||
ps_metrics = client.get_metrics()
|
||||
broken_tenants_metric_filter = {
|
||||
"tenant_id": str(tenant_without_timelines_dir),
|
||||
"state": "Broken",
|
||||
}
|
||||
active_tenants_metric_filter = {
|
||||
"tenant_id": str(tenant_with_empty_timelines),
|
||||
"state": "Active",
|
||||
}
|
||||
|
||||
@@ -402,7 +400,7 @@ def test_pageserver_with_empty_tenants(
|
||||
|
||||
tenant_broken_count = int(
|
||||
ps_metrics.query_one(
|
||||
"pageserver_tenant_states_count", filter=broken_tenants_metric_filter
|
||||
"pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
|
||||
).value
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user