mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
safekeeper: add evicted_timelines gauge. (#9318)
showing total number of evicted timelines.
This commit is contained in:
@@ -12,8 +12,8 @@ use metrics::{
|
||||
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
|
||||
proto::MetricFamily,
|
||||
register_histogram_vec, register_int_counter, register_int_counter_pair,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
|
||||
IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge,
|
||||
HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
@@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static NUM_EVICTED_TIMELINES: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"safekeeper_evicted_timelines",
|
||||
"Number of currently evicted timelines"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub const LABEL_UNKNOWN: &str = "unknown";
|
||||
|
||||
/// Labels for traffic metrics.
|
||||
|
||||
@@ -631,13 +631,19 @@ impl Timeline {
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter);
|
||||
self.bootstrap(
|
||||
shared_state,
|
||||
conf,
|
||||
broker_active_set,
|
||||
partial_backup_rate_limiter,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Bootstrap new or existing timeline starting background tasks.
|
||||
pub fn bootstrap(
|
||||
self: &Arc<Timeline>,
|
||||
_shared_state: &mut WriteGuardSharedState<'_>,
|
||||
conf: &SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
|
||||
@@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn};
|
||||
use utils::crashsafe::durable_rename;
|
||||
|
||||
use crate::{
|
||||
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
|
||||
metrics::{
|
||||
EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
|
||||
},
|
||||
rate_limit::rand_duration,
|
||||
timeline_manager::{Manager, StateSnapshot},
|
||||
wal_backup,
|
||||
@@ -93,6 +95,7 @@ impl Manager {
|
||||
}
|
||||
|
||||
info!("successfully evicted timeline");
|
||||
NUM_EVICTED_TIMELINES.inc();
|
||||
}
|
||||
|
||||
/// Attempt to restore evicted timeline from remote storage; it must be
|
||||
@@ -128,6 +131,7 @@ impl Manager {
|
||||
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
|
||||
|
||||
info!("successfully restored evicted timeline");
|
||||
NUM_EVICTED_TIMELINES.dec();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,10 @@ use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
control_file::{FileStorage, Storage},
|
||||
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
|
||||
metrics::{
|
||||
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS,
|
||||
NUM_EVICTED_TIMELINES,
|
||||
},
|
||||
rate_limit::{rand_duration, RateLimiter},
|
||||
recovery::recovery_main,
|
||||
remove_wal::calc_horizon_lsn,
|
||||
@@ -251,6 +254,11 @@ pub async fn main_task(
|
||||
mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
|
||||
}
|
||||
|
||||
// If timeline is evicted, reflect that in the metric.
|
||||
if mgr.is_offloaded {
|
||||
NUM_EVICTED_TIMELINES.inc();
|
||||
}
|
||||
|
||||
let last_state = 'outer: loop {
|
||||
MANAGER_ITERATIONS_TOTAL.inc();
|
||||
|
||||
@@ -367,6 +375,11 @@ pub async fn main_task(
|
||||
mgr.update_wal_removal_end(res);
|
||||
}
|
||||
|
||||
// If timeline is deleted while evicted decrement the gauge.
|
||||
if mgr.tli.is_cancelled() && mgr.is_offloaded {
|
||||
NUM_EVICTED_TIMELINES.dec();
|
||||
}
|
||||
|
||||
mgr.set_status(Status::Finished);
|
||||
}
|
||||
|
||||
|
||||
@@ -165,12 +165,14 @@ impl GlobalTimelines {
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
Ok(timeline) => {
|
||||
let tli = Arc::new(timeline);
|
||||
let mut shared_state = tli.write_shared_state().await;
|
||||
TIMELINES_STATE
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(ttid, tli.clone());
|
||||
tli.bootstrap(
|
||||
&mut shared_state,
|
||||
&conf,
|
||||
broker_active_set.clone(),
|
||||
partial_backup_rate_limiter.clone(),
|
||||
@@ -213,6 +215,7 @@ impl GlobalTimelines {
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
Ok(timeline) => {
|
||||
let tli = Arc::new(timeline);
|
||||
let mut shared_state = tli.write_shared_state().await;
|
||||
|
||||
// TODO: prevent concurrent timeline creation/loading
|
||||
{
|
||||
@@ -227,8 +230,13 @@ impl GlobalTimelines {
|
||||
state.timelines.insert(ttid, tli.clone());
|
||||
}
|
||||
|
||||
tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter);
|
||||
|
||||
tli.bootstrap(
|
||||
&mut shared_state,
|
||||
&conf,
|
||||
broker_active_set,
|
||||
partial_backup_rate_limiter,
|
||||
);
|
||||
drop(shared_state);
|
||||
Ok(tli)
|
||||
}
|
||||
// If we can't load a timeline, it's bad. Caller will figure it out.
|
||||
|
||||
@@ -2314,12 +2314,12 @@ def test_s3_eviction(
|
||||
]
|
||||
if delete_offloaded_wal:
|
||||
neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
"checkpoint_timeout": "100ms",
|
||||
}
|
||||
)
|
||||
# make lagging_wal_timeout small to force pageserver quickly forget about
|
||||
# safekeeper after it stops sending updates (timeline is deactivated) to
|
||||
# make test faster. Won't be needed with
|
||||
# https://github.com/neondatabase/neon/issues/8148 fixed.
|
||||
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)
|
||||
|
||||
n_timelines = 5
|
||||
|
||||
@@ -2407,9 +2407,37 @@ def test_s3_eviction(
|
||||
and sk.log_contains("successfully restored evicted timeline")
|
||||
for sk in env.safekeepers
|
||||
)
|
||||
|
||||
assert event_metrics_seen
|
||||
|
||||
# test safekeeper_evicted_timelines metric
|
||||
log.info("testing safekeeper_evicted_timelines metric")
|
||||
# checkpoint pageserver to force remote_consistent_lsn update
|
||||
for i in range(n_timelines):
|
||||
ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)
|
||||
for ep in endpoints:
|
||||
log.info(ep.is_running())
|
||||
sk = env.safekeepers[0]
|
||||
|
||||
# all timelines must be evicted eventually
|
||||
def all_evicted():
|
||||
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
|
||||
assert n_evicted # make mypy happy
|
||||
assert int(n_evicted) == n_timelines
|
||||
|
||||
wait_until(60, 0.5, all_evicted)
|
||||
# restart should preserve the metric value
|
||||
sk.stop().start()
|
||||
wait_until(60, 0.5, all_evicted)
|
||||
# and endpoint start should reduce is
|
||||
endpoints[0].start()
|
||||
|
||||
def one_unevicted():
|
||||
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
|
||||
assert n_evicted # make mypy happy
|
||||
assert int(n_evicted) < n_timelines
|
||||
|
||||
wait_until(60, 0.5, one_unevicted)
|
||||
|
||||
|
||||
# Test resetting uploaded partial segment state.
|
||||
def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
Reference in New Issue
Block a user