diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index aa2bafbe92..e8fdddcdc1 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,8 +12,8 @@ use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, - IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, + HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| .expect("Failed to register metric") }); +pub static NUM_EVICTED_TIMELINES: Lazy = Lazy::new(|| { + register_int_gauge!( + "safekeeper_evicted_timelines", + "Number of currently evicted timelines" + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index fb98534768..3494b0b764 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -631,13 +631,19 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); + self.bootstrap( + shared_state, + conf, + broker_active_set, + partial_backup_rate_limiter, + ); Ok(()) } /// Bootstrap new or existing timeline starting background tasks. pub fn bootstrap( self: &Arc, + _shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 5aa4921a92..fae6571277 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ - metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + metrics::{ + EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, + }, rate_limit::rand_duration, timeline_manager::{Manager, StateSnapshot}, wal_backup, @@ -93,6 +95,7 @@ impl Manager { } info!("successfully evicted timeline"); + NUM_EVICTED_TIMELINES.inc(); } /// Attempt to restore evicted timeline from remote storage; it must be @@ -128,6 +131,7 @@ impl Manager { tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); info!("successfully restored evicted timeline"); + NUM_EVICTED_TIMELINES.dec(); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f5535c0cea..2129e86baa 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -25,7 +25,10 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, + NUM_EVICTED_TIMELINES, + }, rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, @@ -251,6 +254,11 @@ pub async fn main_task( mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); } + // If timeline is evicted, reflect that in the metric. + if mgr.is_offloaded { + NUM_EVICTED_TIMELINES.inc(); + } + let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); @@ -367,6 +375,11 @@ pub async fn main_task( mgr.update_wal_removal_end(res); } + // If timeline is deleted while evicted decrement the gauge. + if mgr.tli.is_cancelled() && mgr.is_offloaded { + NUM_EVICTED_TIMELINES.dec(); + } + mgr.set_status(Status::Finished); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 6662e18817..866cde3339 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -165,12 +165,14 @@ impl GlobalTimelines { match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); + let mut shared_state = tli.write_shared_state().await; TIMELINES_STATE .lock() .unwrap() .timelines .insert(ttid, tli.clone()); tli.bootstrap( + &mut shared_state, &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), @@ -213,6 +215,7 @@ impl GlobalTimelines { match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); + let mut shared_state = tli.write_shared_state().await; // TODO: prevent concurrent timeline creation/loading { @@ -227,8 +230,13 @@ impl GlobalTimelines { state.timelines.insert(ttid, tli.clone()); } - tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); - + tli.bootstrap( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ); + drop(shared_state); Ok(tli) } // If we can't load a timeline, it's bad. Caller will figure it out. diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d372e2d461..d803cd7c78 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2314,12 +2314,12 @@ def test_s3_eviction( ] if delete_offloaded_wal: neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) + # make lagging_wal_timeout small to force pageserver quickly forget about + # safekeeper after it stops sending updates (timeline is deactivated) to + # make test faster. Won't be needed with + # https://github.com/neondatabase/neon/issues/8148 fixed. + initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"} + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) n_timelines = 5 @@ -2407,9 +2407,37 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) - assert event_metrics_seen + # test safekeeper_evicted_timelines metric + log.info("testing safekeeper_evicted_timelines metric") + # checkpoint pageserver to force remote_consistent_lsn update + for i in range(n_timelines): + ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + for ep in endpoints: + log.info(ep.is_running()) + sk = env.safekeepers[0] + + # all timelines must be evicted eventually + def all_evicted(): + n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted # make mypy happy + assert int(n_evicted) == n_timelines + + wait_until(60, 0.5, all_evicted) + # restart should preserve the metric value + sk.stop().start() + wait_until(60, 0.5, all_evicted) + # and endpoint start should reduce is + endpoints[0].start() + + def one_unevicted(): + n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted # make mypy happy + assert int(n_evicted) < n_timelines + + wait_until(60, 0.5, one_unevicted) + # Test resetting uploaded partial segment state. def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):