safekeeper: add evicted_timelines gauge. (#9318)

showing total number of evicted timelines.
This commit is contained in:
Arseny Sher
2024-10-09 14:40:30 +03:00
committed by GitHub
parent fc7397122c
commit a181392738
6 changed files with 81 additions and 14 deletions

View File

@@ -12,8 +12,8 @@ use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_histogram_vec, register_int_counter, register_int_counter_pair,
register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge,
HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(||
.expect("Failed to register metric")
});
pub static NUM_EVICTED_TIMELINES: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"safekeeper_evicted_timelines",
"Number of currently evicted timelines"
)
.expect("Failed to register metric")
});
pub const LABEL_UNKNOWN: &str = "unknown";
/// Labels for traffic metrics.

View File

@@ -631,13 +631,19 @@ impl Timeline {
return Err(e);
}
self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter);
self.bootstrap(
shared_state,
conf,
broker_active_set,
partial_backup_rate_limiter,
);
Ok(())
}
/// Bootstrap new or existing timeline starting background tasks.
pub fn bootstrap(
self: &Arc<Timeline>,
_shared_state: &mut WriteGuardSharedState<'_>,
conf: &SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
partial_backup_rate_limiter: RateLimiter,

View File

@@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn};
use utils::crashsafe::durable_rename;
use crate::{
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
metrics::{
EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
},
rate_limit::rand_duration,
timeline_manager::{Manager, StateSnapshot},
wal_backup,
@@ -93,6 +95,7 @@ impl Manager {
}
info!("successfully evicted timeline");
NUM_EVICTED_TIMELINES.inc();
}
/// Attempt to restore evicted timeline from remote storage; it must be
@@ -128,6 +131,7 @@ impl Manager {
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
info!("successfully restored evicted timeline");
NUM_EVICTED_TIMELINES.dec();
}
}

View File

@@ -25,7 +25,10 @@ use utils::lsn::Lsn;
use crate::{
control_file::{FileStorage, Storage},
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
metrics::{
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS,
NUM_EVICTED_TIMELINES,
},
rate_limit::{rand_duration, RateLimiter},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
@@ -251,6 +254,11 @@ pub async fn main_task(
mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
}
// If timeline is evicted, reflect that in the metric.
if mgr.is_offloaded {
NUM_EVICTED_TIMELINES.inc();
}
let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();
@@ -367,6 +375,11 @@ pub async fn main_task(
mgr.update_wal_removal_end(res);
}
// If timeline is deleted while evicted decrement the gauge.
if mgr.tli.is_cancelled() && mgr.is_offloaded {
NUM_EVICTED_TIMELINES.dec();
}
mgr.set_status(Status::Finished);
}

View File

@@ -165,12 +165,14 @@ impl GlobalTimelines {
match Timeline::load_timeline(&conf, ttid) {
Ok(timeline) => {
let tli = Arc::new(timeline);
let mut shared_state = tli.write_shared_state().await;
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(
&mut shared_state,
&conf,
broker_active_set.clone(),
partial_backup_rate_limiter.clone(),
@@ -213,6 +215,7 @@ impl GlobalTimelines {
match Timeline::load_timeline(&conf, ttid) {
Ok(timeline) => {
let tli = Arc::new(timeline);
let mut shared_state = tli.write_shared_state().await;
// TODO: prevent concurrent timeline creation/loading
{
@@ -227,8 +230,13 @@ impl GlobalTimelines {
state.timelines.insert(ttid, tli.clone());
}
tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter);
tli.bootstrap(
&mut shared_state,
&conf,
broker_active_set,
partial_backup_rate_limiter,
);
drop(shared_state);
Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.

View File

@@ -2314,12 +2314,12 @@ def test_s3_eviction(
]
if delete_offloaded_wal:
neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_timeout": "100ms",
}
)
# make lagging_wal_timeout small to force pageserver quickly forget about
# safekeeper after it stops sending updates (timeline is deactivated) to
# make test faster. Won't be needed with
# https://github.com/neondatabase/neon/issues/8148 fixed.
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
n_timelines = 5
@@ -2407,9 +2407,37 @@ def test_s3_eviction(
and sk.log_contains("successfully restored evicted timeline")
for sk in env.safekeepers
)
assert event_metrics_seen
# test safekeeper_evicted_timelines metric
log.info("testing safekeeper_evicted_timelines metric")
# checkpoint pageserver to force remote_consistent_lsn update
for i in range(n_timelines):
ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)
for ep in endpoints:
log.info(ep.is_running())
sk = env.safekeepers[0]
# all timelines must be evicted eventually
def all_evicted():
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted # make mypy happy
assert int(n_evicted) == n_timelines
wait_until(60, 0.5, all_evicted)
# restart should preserve the metric value
sk.stop().start()
wait_until(60, 0.5, all_evicted)
# and endpoint start should reduce is
endpoints[0].start()
def one_unevicted():
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted # make mypy happy
assert int(n_evicted) < n_timelines
wait_until(60, 0.5, one_unevicted)
# Test resetting uploaded partial segment state.
def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):