pageserver: rate limited warning on too many layers visited

2026-01-16 18:02:56 +00:00 · 2024-04-22 17:30:21 +01:00
parent 7a8effc190
commit 82e3866c27
2 changed files with 50 additions and 3 deletions
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -5,6 +5,7 @@ use std::time::{Duration, Instant};
 pub struct RateLimit {
    last: Option<Instant>,
    interval: Duration,
+    rate_limited: u32,
 }

 impl RateLimit {
@@ -12,9 +13,16 @@ impl RateLimit {
        Self {
            last: None,
            interval,
+            rate_limited: 0,
        }
    }

+    /// Returns the number of calls that were rate limited
+    /// since the last one was allowed.
+    pub fn rate_limited(&self) -> u32 {
+        self.rate_limited
+    }
+
    /// Call `f` if the rate limit allows.
    /// Don't call it otherwise.
    pub fn call<F: FnOnce()>(&mut self, f: F) {
@@ -22,9 +30,13 @@ impl RateLimit {
        match self.last {
            Some(last) if now - last <= self.interval => {
                // ratelimit
+                if let Some(updated) = self.rate_limited.checked_add(1) {
+                    self.rate_limited = updated;
+                }
            }
            _ => {
                self.last = Some(now);
+                self.rate_limited = 0;
                f();
            }
        }
@@ -50,17 +62,24 @@ mod tests {

        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 0);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 1);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 2);
        std::thread::sleep(Duration::from_millis(100));
        f.call(cl);
        assert_eq!(called.load(Relaxed), 2);
+        assert_eq!(f.rate_limited(), 0);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 2);
+        assert_eq!(f.rate_limited(), 1);
+        f.call(cl);
        std::thread::sleep(Duration::from_millis(100));
        f.call(cl);
        assert_eq!(called.load(Relaxed), 3);
+        assert_eq!(f.rate_limited(), 0);
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -824,6 +824,10 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+    // If we are visiting more than 50 layers to reconstruct one value,
+    // it's likely that something is wrong with compaction. We print a
+    // rate limited warning in that case.
+    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u64 = 50;

    /// Look up multiple page versions at a given LSN
    ///
@@ -975,8 +979,21 @@ impl Timeline {
        // Note that this is an approximation. Tracking the exact number of layers visited
        // per key requires virtually unbounded memory usage and is inefficient
        // (i.e. segment tree tracking each range queried from a layer)
-        crate::metrics::VEC_READ_NUM_LAYERS_VISITED
-            .observe(layers_visited as f64 / results.len() as f64);
+        let average_layers_visited = layers_visited as f64 / results.len() as f64;
+
+        crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(average_layers_visited);
+
+        if average_layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD as f64 {
+            use utils::rate_limit::RateLimit;
+            static LOGGED: Lazy<Mutex<RateLimit>> =
+                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+            let mut rate_limit = LOGGED.lock().unwrap();
+            let rate_limited = rate_limit.rate_limited();
+            rate_limit.call(|| {
+                warn!("Too many layers visited by vectored read: {} >= {}; rate limited {} occurences",
+                      average_layers_visited, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
+            });
+        }

        Ok(results)
    }
@@ -2801,7 +2818,18 @@ impl Timeline {
        let mut timeline = self;

        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64);
+            if cnt >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                let rate_limited = rate_limit.rate_limited();
+                rate_limit.call(|| {
+                    warn!("Too many layers visited by non-vectored read: {} >= {}; rate limited {} occurences",
+                          cnt, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
+                });
+            }
        });

        // For debugging purposes, collect the path of layers that we traversed