Compare commits

...

1 Commits

Author SHA1 Message Date
Arseny Sher
459b37a878 safekeeper: more timeline manager observability
We seem to have an issue with the active timelines set being larger than it
should be. Add more observability around it:
- measure total number of running timeline managers
- measure total number of cancelled timelines
- measure number of timelines in the active set directly
  (safekeeper_active_timelines excludes cancelled timelines).

Previously, get_all() on the global map excluded cancelled timelines. This is
not great because both safekeeper_timelines and debug dumps missed them; fix it.
2024-06-06 07:32:36 +03:00
4 changed files with 56 additions and 8 deletions

View File

@@ -12,8 +12,8 @@ use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
IntGaugeVec,
register_int_counter_vec, register_int_gauge, Gauge, IntCounter, IntCounterPair,
IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -163,6 +163,13 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
});
/// Gauge exported as `safekeeper_managers_running`.
///
/// The timeline manager's `main_task` increments it on entry and decrements it
/// from a scope guard on exit, so the value is the number of manager tasks
/// currently alive. Registration happens lazily on first access; a
/// registration error panics.
pub static MANAGERS_RUNNING: Lazy<IntGauge> = Lazy::new(|| {
    let registration = register_int_gauge!(
        "safekeeper_managers_running",
        "Number of timeline managers running. Should match safekeeper_timelines minus safekeeper_timelines_cancelled."
    );
    registration.expect("failed to define a metric")
});
pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_manager_iterations_total",
@@ -385,7 +392,9 @@ pub struct TimelineCollector {
flushed_wal_seconds: GaugeVec,
collect_timeline_metrics: Gauge,
timelines_count: IntGauge,
timelines_cancelled_count: IntGauge,
active_timelines_count: IntGauge,
active_timelines_set_size: IntGauge,
}
impl Default for TimelineCollector {
@@ -573,11 +582,18 @@ impl TimelineCollector {
let timelines_count = IntGauge::new(
"safekeeper_timelines",
"Total number of timelines loaded in-memory",
"Total number of timelines loaded in-memory, including cancelled (deleted) ones.",
)
.unwrap();
descs.extend(timelines_count.desc().into_iter().cloned());
let timelines_cancelled_count = IntGauge::new(
"safekeeper_timelines_cancelled",
"Number of cancelled timelines loaded in-memory",
)
.unwrap();
descs.extend(timelines_cancelled_count.desc().into_iter().cloned());
let active_timelines_count = IntGauge::new(
"safekeeper_active_timelines",
"Total number of active timelines",
@@ -585,6 +601,13 @@ impl TimelineCollector {
.unwrap();
descs.extend(active_timelines_count.desc().into_iter().cloned());
let active_timelines_set_size = IntGauge::new(
"safekeeper_active_timelines_set_size",
"Size of the active timelines hashset. Should match safekeeper_active_timelines metric.",
)
.unwrap();
descs.extend(active_timelines_set_size.desc().into_iter().cloned());
TimelineCollector {
descs,
commit_lsn,
@@ -606,7 +629,9 @@ impl TimelineCollector {
flushed_wal_seconds,
collect_timeline_metrics,
timelines_count,
timelines_cancelled_count,
active_timelines_count,
active_timelines_set_size,
}
}
}
@@ -759,10 +784,19 @@ impl Collector for TimelineCollector {
self.timelines_count.set(timelines_count as i64);
mfs.extend(self.timelines_count.collect());
// report number of cancelled timelines
self.timelines_cancelled_count
.set(GlobalTimelines::get_num_cancelled() as i64);
mfs.extend(self.timelines_cancelled_count.collect());
self.active_timelines_count
.set(active_timelines_count as i64);
mfs.extend(self.active_timelines_count.collect());
self.active_timelines_set_size
.set(GlobalTimelines::get_global_broker_active_set().get_len() as i64);
mfs.extend(self.active_timelines_set_size.collect());
mfs
}
}

View File

@@ -15,7 +15,7 @@ use utils::lsn::Lsn;
use crate::{
control_file::Storage,
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
metrics::{MANAGERS_RUNNING, MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
send_wal::WalSenders,
@@ -83,7 +83,10 @@ pub async fn main_task(
conf: SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
) {
MANAGERS_RUNNING.inc();
scopeguard::defer! {
MANAGERS_RUNNING.dec();
if tli.is_cancelled() {
info!("manager task finished");
} else {

View File

@@ -296,15 +296,21 @@ impl GlobalTimelines {
}
}
/// Returns all timelines. This is used for background timeline processes.
/// Snapshot of every timeline in the global map, cancelled (deleted) ones
/// *included*. Callers that only want live timelines must filter the result
/// themselves.
pub fn get_all() -> Vec<Arc<Timeline>> {
    // Hold the global state lock only while the handles are being cloned.
    let state = TIMELINES_STATE.lock().unwrap();
    state.timelines.values().map(Arc::clone).collect()
}
/// Get number of cancelled timelines.
///
/// Walks the global timelines map under the `TIMELINES_STATE` lock and counts
/// the entries for which `is_cancelled()` returns true. Nothing is cloned or
/// collected; only the count leaves the critical section.
pub fn get_num_cancelled() -> usize {
    let global_lock = TIMELINES_STATE.lock().unwrap();
    global_lock
        .timelines
        .values()
        // Keep only cancelled timelines; the inverse predicate belongs to the
        // old filtering `get_all()` and must not appear in this chain.
        .filter(|t| t.is_cancelled())
        .count()
}
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,

View File

@@ -49,6 +49,11 @@ impl TimelinesSet {
self.timelines.lock().unwrap().values().cloned().collect()
}
/// Number of timelines currently stored in the set.
pub fn get_len(&self) -> usize {
    // Take the lock just long enough to read the length.
    let guard = self.timelines.lock().unwrap();
    guard.len()
}
/// Returns a timeline guard for easy presence control.
pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
let is_present = self.is_present(&tli.ttid);