mirror of https://github.com/neondatabase/neon.git, synced 2026-01-10 15:02:56 +00:00
metrics: remove broken tenants (#6586)
Before tenant migration, it made sense to leak broken tenants in the metrics until restart. Nowadays it makes less sense, because we set the tenant to Broken on cancellation. The set metric still allows filterable alerting.

Fixes: #6507
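The "set" is a labeled gauge vector: one time series per (tenant_id, shard_id) whose sample is pinned to 1 while the tenant is broken, and which this commit now deletes outright when the tenant leaves the tenant map. A minimal sketch of that mechanism, using the prometheus crate directly; the real BROKEN_TENANTS_SET lives in pageserver's crate::metrics and its definition is not part of this diff, so the help text and the ids below are illustrative:

    // sketch only: stands in for pageserver's BROKEN_TENANTS_SET
    use prometheus::{register_int_gauge_vec, IntGaugeVec};

    fn main() {
        let broken: IntGaugeVec = register_int_gauge_vec!(
            "pageserver_broken_tenants_count",
            "Set of broken tenants",
            &["tenant_id", "shard_id"]
        )
        .unwrap();

        // entering the set: the series exists and never reads 0, so an alert
        // such as `pageserver_broken_tenants_count > 0` can filter on labels
        let set_key = &["<tenant_id>", "<shard_id>"][..];
        broken.with_label_values(set_key).set(1);

        // leaving the set (this commit's change): delete the series instead of
        // leaking it until the next pageserver restart
        let _ = broken.remove_label_values(set_key);
    }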
@@ -67,7 +67,9 @@ use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT;
-use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
+use crate::metrics::{
+    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
+};
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
@@ -2637,9 +2639,16 @@ impl Tenant {
         let (state, mut rx) = watch::channel(state);

         tokio::spawn(async move {
-            // Strings for metric labels
+            // reflect tenant state in metrics:
+            // - global per tenant state: TENANT_STATE_METRIC
+            // - "set" of broken tenants: BROKEN_TENANTS_SET
+            //
+            // set of broken tenants should not have zero counts so that it remains accessible for
+            // alerting.
+
             let tid = tenant_shard_id.to_string();
-            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
+            let shard_id = tenant_shard_id.shard_slug().to_string();
+            let set_key = &[tid.as_str(), shard_id.as_str()][..];

             fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                 ([state.into()], matches!(state, TenantState::Broken { .. }))
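For context, inspect_state (pre-existing, shown above as context lines) packs the two things the metrics task needs from each observed state: a one-element label array for TENANT_STATE_METRIC and a broken flag for the set bookkeeping. A self-contained sketch of the same pattern, with a hypothetical two-variant TenantState standing in for the real enum:

    // stand-in enum; the real TenantState has more variants and fields
    enum TenantState {
        Active,
        Broken { reason: String },
    }

    impl From<&TenantState> for &'static str {
        fn from(state: &TenantState) -> &'static str {
            match state {
                TenantState::Active => "Active",
                TenantState::Broken { .. } => "Broken",
            }
        }
    }

    fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
        // the array feeds straight into with_label_values(&labels)
        ([state.into()], matches!(state, TenantState::Broken { .. }))
    }

    fn main() {
        let (labels, is_broken) = inspect_state(&TenantState::Broken { reason: "io".into() });
        assert_eq!(labels, ["Broken"]);
        assert!(is_broken);
    }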
@@ -2648,21 +2657,13 @@ impl Tenant {
             let mut tuple = inspect_state(&rx.borrow_and_update());

             let is_broken = tuple.1;
-            let mut counted_broken = if !is_broken {
-                // the tenant might be ignored and reloaded, so first remove any previous set
-                // element. it most likely has already been scraped, as these are manual operations
-                // right now. most likely we will add it back very soon.
-                drop(
-                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
-                );
-                false
-            } else {
+            let mut counted_broken = if is_broken {
                 // add the id to the set right away, there should not be any updates on the channel
-                // after
-                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid, &shard_id_str])
-                    .set(1);
+                // after before tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 true
+            } else {
+                false
             };

             loop {
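The task around this hunk is driven by a tokio watch channel: borrow_and_update reads the latest state and marks it seen, and changed() returning Err signals that the sender half (held by the Tenant) was dropped. A minimal sketch of that protocol, with plain string states standing in for TenantState:

    use tokio::sync::watch;

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel("Attaching");

        let task = tokio::spawn(async move {
            // read the initial value and mark it seen, as the metrics task does
            let mut state: &'static str = *rx.borrow_and_update();
            loop {
                println!("observed: {state}");
                // Err: sender dropped, i.e. the tenant left the tenant map;
                // this is where per-tenant metrics get torn down
                if rx.changed().await.is_err() {
                    break;
                }
                state = *rx.borrow_and_update();
            }
        });

        tx.send("Active").unwrap();
        drop(tx); // ends the loop
        task.await.unwrap();
    }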
@@ -2671,10 +2672,9 @@ impl Tenant {
                 current.inc();

                 if rx.changed().await.is_err() {
-                    // tenant has been dropped; decrement the counter because a tenant with that
-                    // state is no longer in tenant map, but allow any broken set item to exist
-                    // still.
+                    // tenant has been dropped
                     current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                     break;
                 }
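The drop(...) wrapper above discards the Result from remove_label_values, which errors when no series exists for the key; that is the expected case for a tenant that was never broken. A small sketch of that behavior, again using the prometheus crate with illustrative names:

    use prometheus::register_int_gauge_vec;

    fn main() {
        let set = register_int_gauge_vec!("broken_tenants_demo", "example set", &["id"]).unwrap();
        let key = &["tenant-a"][..];

        // no series yet: removal errors, and ignoring that is deliberate
        assert!(set.remove_label_values(key).is_err());

        set.with_label_values(key).set(1);
        // the series exists now: removal succeeds and it vanishes from scrapes
        assert!(set.remove_label_values(key).is_ok());
    }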
@@ -2684,10 +2684,9 @@ impl Tenant {
                 let is_broken = tuple.1;
                 if is_broken && !counted_broken {
                     counted_broken = true;
-                    // insert the tenant_id (back) into the set
-                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid, &shard_id_str])
-                        .inc();
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
                 }
             }
         });
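One detail in the reinsertion path: the new code uses set(1) where the old code used inc(). On a gauge used as a set membership marker, set is idempotent, so re-marking an already-counted tenant can never push the sample above 1; a sketch under the same prometheus-crate assumption as above:

    use prometheus::register_int_gauge_vec;

    fn main() {
        let set = register_int_gauge_vec!("broken_set_demo", "example set", &["id"]).unwrap();
        let key = &["tenant-b"][..];

        set.with_label_values(key).set(1);
        set.with_label_values(key).set(1); // idempotent: still exactly 1
        assert_eq!(set.with_label_values(key).get(), 1);

        // inc() by contrast would accumulate, which a membership set must not do
    }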
@@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
-    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
+    # "pageserver_broken_tenants_count" -- used only for broken
 )
@@ -742,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
 def test_metrics_while_ignoring_broken_tenant_and_reloading(
     neon_env_builder: NeonEnvBuilder,
 ):
-    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
     env = neon_env_builder.init_start()

     client = env.pageserver.http_client()
@@ -761,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

     client.tenant_break(env.initial_tenant)

-    found_broken = False
-    active, broken, broken_set = ([], [], [])
-    for _ in range(10):
+    def found_broken():
         m = client.get_metrics()
         active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
         broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
+        assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1

-        if found_broken:
-            break
-        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
-        time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_broken)

     client.tenant_ignore(env.initial_tenant)

-    found_broken = False
-    broken, broken_set = ([], [])
-    for _ in range(10):
+    def found_cleaned_up():
         m = client.get_metrics()
         broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1
+        assert only_int(broken) == 0 and len(broken_set) == 0

-        if found_broken:
-            break
-        time.sleep(0.5)
-    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_cleaned_up)

     env.pageserver.tenant_load(env.initial_tenant)

-    found_active = False
-    active, broken_set = ([], [])
-    for _ in range(10):
+    def found_active():
         m = client.get_metrics()
         active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
         broken_set = m.query_all(
             "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
         )
-        found_active = only_int(active) == 1 and len(broken_set) == 0
+        assert only_int(active) == 1 and len(broken_set) == 0

-        if found_active:
-            break
-        time.sleep(0.5)
-
-    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
+    wait_until(10, 0.5, found_active)