Revert "serialize find_victim callers through std mutex"

This reverts commit 74601238ee.
2026-05-15 20:20:38 +00:00 · 2023-10-04 16:47:50 +00:00
parent 3adaec3ab2
commit 6bfc0492ac
1 changed files with 43 additions and 6 deletions
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -250,7 +250,7 @@ pub struct PageCache {

    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
-    next_evict_slot: std::sync::Mutex<usize>,
+    next_evict_slot: AtomicUsize,

    size_metrics: &'static PageCacheSizeMetrics,
 }
@@ -882,11 +882,11 @@ impl PageCache {
        &self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let mut next_evict_slot = self.next_evict_slot.lock().unwrap();
+        let iter_limit = self.slots.len() * 10;
+        let mut iters = 0;
        loop {
-            let slot_idx = *next_evict_slot % self.slots.len();
-            *next_evict_slot = next_evict_slot.wrapping_add(1);
-
+            iters += 1;
+            let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();

            let slot = &self.slots[slot_idx];

@@ -894,6 +894,42 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
+                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
+                            anyhow::bail!("exceeded evict iter limit");
+                        }
                        continue;
                    }
                };
@@ -902,6 +938,7 @@ impl PageCache {
                    self.remove_mapping(old_key);
                    inner.key = None;
                }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
@@ -943,7 +980,7 @@ impl PageCache {
            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
-            next_evict_slot: std::sync::Mutex::new(0),
+            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
        }