metrics: remove broken tenants (#6586)

Before tenant migration it made sense to leak broken tenants in the
metrics until restart. Nowdays it makes less sense because on
cancellations we set the tenant broken. The set metric still allows
filterable alerting.

Fixes: #6507
This commit is contained in:
Joonas Koivunen
2024-02-05 14:49:35 +02:00
committed by GitHub
parent db89b13aaa
commit 5e8deca268
3 changed files with 32 additions and 54 deletions

View File

@@ -67,7 +67,9 @@ use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::repository::GcResult;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
@@ -2637,9 +2639,16 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
// Strings for metric labels
// reflect tenant state in metrics:
// - global per tenant state: TENANT_STATE_METRIC
// - "set" of broken tenants: BROKEN_TENANTS_SET
//
// set of broken tenants should not have zero counts so that it remains accessible for
// alerting.
let tid = tenant_shard_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
let shard_id = tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2648,21 +2657,13 @@ impl Tenant {
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
let mut counted_broken = if !is_broken {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
);
false
} else {
let mut counted_broken = if is_broken {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.set(1);
// after before tenant is removed, if ever
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
true
} else {
false
};
loop {
@@ -2671,10 +2672,9 @@ impl Tenant {
current.inc();
if rx.changed().await.is_err() {
// tenant has been dropped; decrement the counter because a tenant with that
// state is no longer in tenant map, but allow any broken set item to exist
// still.
// tenant has been dropped
current.dec();
drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
break;
}
@@ -2684,10 +2684,9 @@ impl Tenant {
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.inc();
// insert the tenant_id (back) into the set while avoiding needless counter
// access
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
}
}
});

View File

@@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
# "pageserver_broken_tenants_count" -- used only for broken
)

View File

@@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
def test_metrics_while_ignoring_broken_tenant_and_reloading(
neon_env_builder: NeonEnvBuilder,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
client = env.pageserver.http_client()
@@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
client.tenant_break(env.initial_tenant)
found_broken = False
active, broken, broken_set = ([], [], [])
for _ in range(10):
def found_broken():
m = client.get_metrics()
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
if found_broken:
break
log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
time.sleep(0.5)
assert (
found_broken
), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
wait_until(10, 0.5, found_broken)
client.tenant_ignore(env.initial_tenant)
found_broken = False
broken, broken_set = ([], [])
for _ in range(10):
def found_cleaned_up():
m = client.get_metrics()
broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
assert only_int(broken) == 0 and len(broken_set) == 0
if found_broken:
break
time.sleep(0.5)
assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
wait_until(10, 0.5, found_cleaned_up)
env.pageserver.tenant_load(env.initial_tenant)
found_active = False
active, broken_set = ([], [])
for _ in range(10):
def found_active():
m = client.get_metrics()
active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
broken_set = m.query_all(
"pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
)
found_active = only_int(active) == 1 and len(broken_set) == 0
assert only_int(active) == 1 and len(broken_set) == 0
if found_active:
break
time.sleep(0.5)
assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
wait_until(10, 0.5, found_active)