pageserver: rate limited warning on too many layers visited

This commit is contained in:
Vlad Lazar
2024-04-22 17:30:21 +01:00
parent 7a8effc190
commit 82e3866c27
2 changed files with 50 additions and 3 deletions

View File

@@ -5,6 +5,7 @@ use std::time::{Duration, Instant};
pub struct RateLimit {
last: Option<Instant>,
interval: Duration,
rate_limited: u32,
}
impl RateLimit {
@@ -12,9 +13,16 @@ impl RateLimit {
Self {
last: None,
interval,
rate_limited: 0,
}
}
/// Returns the number of calls that were rate limited
/// since the last one was allowed.
pub fn rate_limited(&self) -> u32 {
self.rate_limited
}
/// Call `f` if the rate limit allows.
/// Don't call it otherwise.
pub fn call<F: FnOnce()>(&mut self, f: F) {
@@ -22,9 +30,13 @@ impl RateLimit {
match self.last {
Some(last) if now - last <= self.interval => {
// ratelimit
if let Some(updated) = self.rate_limited.checked_add(1) {
self.rate_limited = updated;
}
}
_ => {
self.last = Some(now);
self.rate_limited = 0;
f();
}
}
@@ -50,17 +62,24 @@ mod tests {
f.call(cl);
assert_eq!(called.load(Relaxed), 1);
assert_eq!(f.rate_limited(), 0);
f.call(cl);
assert_eq!(called.load(Relaxed), 1);
assert_eq!(f.rate_limited(), 1);
f.call(cl);
assert_eq!(called.load(Relaxed), 1);
assert_eq!(f.rate_limited(), 2);
std::thread::sleep(Duration::from_millis(100));
f.call(cl);
assert_eq!(called.load(Relaxed), 2);
assert_eq!(f.rate_limited(), 0);
f.call(cl);
assert_eq!(called.load(Relaxed), 2);
assert_eq!(f.rate_limited(), 1);
f.call(cl);
std::thread::sleep(Duration::from_millis(100));
f.call(cl);
assert_eq!(called.load(Relaxed), 3);
assert_eq!(f.rate_limited(), 0);
}
}

View File

@@ -824,6 +824,10 @@ impl Timeline {
}
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
// If we are visiting more than 50 layers to reconstruct one value,
// it's likely that something is wrong with compaction. We print a
// rate limited warning in that case.
pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u64 = 50;
/// Look up multiple page versions at a given LSN
///
@@ -975,8 +979,21 @@ impl Timeline {
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
let average_layers_visited = layers_visited as f64 / results.len() as f64;
crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(average_layers_visited);
if average_layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD as f64 {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut rate_limit = LOGGED.lock().unwrap();
let rate_limited = rate_limit.rate_limited();
rate_limit.call(|| {
warn!("Too many layers visited by vectored read: {} >= {}; rate limited {} occurences",
average_layers_visited, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
});
}
Ok(results)
}
@@ -2801,7 +2818,18 @@ impl Timeline {
let mut timeline = self;
let mut read_count = scopeguard::guard(0, |cnt| {
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64);
if cnt >= Self::LAYERS_VISITED_WARN_THRESHOLD {
use utils::rate_limit::RateLimit;
static LOGGED: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
let mut rate_limit = LOGGED.lock().unwrap();
let rate_limited = rate_limit.rate_limited();
rate_limit.call(|| {
warn!("Too many layers visited by non-vectored read: {} >= {}; rate limited {} occurences",
cnt, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
});
}
});
// For debugging purposes, collect the path of layers that we traversed