Compare commits

...

28 Commits

Author SHA1 Message Date
Christian Schwarz
4b7fddeabe backup download_all_layers.py script 2023-10-26 08:14:08 +00:00
Christian Schwarz
68386c19a2 backing up the exact tokio-epoll-uring version used in the earlier (since reverted) integration commit
commit dde7c280e77dbb867d2fd459d629da2fd7b0edc6 (HEAD -> problame/wip-2023-10-17, origin/problame/wip-2023-10-17)
Author: Christian Schwarz <me@cschwarz.com>
Date:   Tue Oct 17 10:09:48 2023 +0000

    no info! logging (not sure this matters, tracing showed up in perf when integrating this branch into neon.git)

The integration commit in this branch was:

commit 61fac1ab0b
Author: Christian Schwarz <me@cschwarz.com>
Date:   Tue Aug 29 19:13:38 2023 +0000

    CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking
2023-10-17 10:12:22 +00:00
Christian Schwarz
db787dd6e0 backing up pageserver.toml used in experiments
    remote_storage = { local_path = '/home/admin/neon-main/test_output/test_pageserver_startup_many_tenants/repo/local_fs_remote_storage/pageserver' }
    id = 1
    pg_distrib_dir = '/home/admin/neon-main/pg_install'
    http_auth_type = 'Trust'
    pg_auth_type = 'Trust'
    listen_http_addr = 'localhost:15003'
    listen_pg_addr = 'localhost:15002'
    broker_endpoint = 'http://127.0.0.1:15001/'

    # 2023-10-10 17:46 UTC
    #page_cache_size = 16384
    #max_file_descriptors = 2000

    # 2023-10-10 18:10 UTC
    page_cache_size = 819200
    max_file_descriptors = 100000

    # Initial configuration file created by 'pageserver --init'
    #listen_pg_addr = '127.0.0.1:64000'
    #listen_http_addr = '127.0.0.1:9898'

    #wait_lsn_timeout = '60 s'
    #wal_redo_timeout = '60 s'

    #max_file_descriptors = 100

    # initial superuser role name to use when creating a new tenant
    #initial_superuser_name = 'cloud_admin'

    #broker_endpoint = 'http://127.0.0.1:50051'

    #log_format = 'plain'

    #concurrent_tenant_size_logical_size_queries = '1'

    #metric_collection_interval = '10 min'
    #cached_metric_collection_interval = '0s'
    #synthetic_size_calculation_interval = '10 min'

    #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"}

    #background_task_maximum_delay = '10s'

    metric_collection_endpoint = "https://localtest.me:23423"
    metric_collection_interval = "10min"
    cached_metric_collection_interval = "0s"

    [tenant_config]
    eviction_policy = { kind = "LayerAccessThreshold", period = "10m", threshold = "7d" }
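    # note: LayerAccessThreshold evicts layers whose most recent access is older than `threshold`; the eviction loop runs every `period`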
2023-10-10 18:16:58 +00:00
Christian Schwarz
6c5e8c6bb6 backing up useful prometheus queries:
http://localhost:9090/graph?g0.expr=rate(pageserver_getpage_reconstruct_seconds_count%5B20s%5D)&g0.tab=0&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g1.expr=sum(rate(pageserver_page_cache_find_victim_iters_total%5B20s%5D))%0A%2F%0Asum(rate(pageserver_getpage_reconstruct_seconds_count%5B20s%5D))&g1.tab=0&g1.stacked=0&g1.show_exemplars=0&g1.range_input=6h&g2.expr=pageserver_storage_operations_seconds_global_count%7Boperation!%3D%22load%20layer%20map%22%7D&g2.tab=0&g2.stacked=0&g2.show_exemplars=0&g2.range_input=30m&g3.expr=sum(rate(pageserver_background_loop_period_overrun_count%5B20s%5D))%20by%20(task%2Cperiod)&g3.tab=0&g3.stacked=0&g3.show_exemplars=0&g3.range_input=1h&g4.expr=pageserver_background_loop_semaphore_wait_start_count%0A-%0Apageserver_background_loop_semaphore_wait_finish_count&g4.tab=0&g4.stacked=0&g4.show_exemplars=0&g4.range_input=1h
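Decoded, the five graphs in that URL are:

    g0: rate(pageserver_getpage_reconstruct_seconds_count[20s])
    g1: sum(rate(pageserver_page_cache_find_victim_iters_total[20s]))
        / sum(rate(pageserver_getpage_reconstruct_seconds_count[20s]))
    g2: pageserver_storage_operations_seconds_global_count{operation!="load layer map"}
    g3: sum(rate(pageserver_background_loop_period_overrun_count[20s])) by (task,period)
    g4: pageserver_background_loop_semaphore_wait_start_count
        - pageserver_background_loop_semaphore_wait_finish_count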
2023-10-10 17:55:54 +00:00
Christian Schwarz
c5259dcf32 WIP++ v2 limit eviction task concurrency: metric & enum 2023-10-10 17:52:32 +00:00
Christian Schwarz
112008519c HACK: BACKGROUND_RUNTIME webserver to measure response time using wrk 2023-10-10 13:37:16 +00:00
Christian Schwarz
5917a54719 Revert "WIP: tracing-flame support"
This reverts commit dbe3290f89.
2023-10-10 13:35:55 +00:00
Christian Schwarz
dbe3290f89 WIP: tracing-flame support 2023-10-10 12:17:55 +00:00
Christian Schwarz
bfcde8f9e6 WIP v2 limit eviction task concurrency
This reverts commit 55106aa981.
2023-10-10 12:17:55 +00:00
Christian Schwarz
dbb8377983 Revert "CP tokio_epoll_uring for read path"
This reverts commit 1556234d9a.
2023-10-10 12:17:55 +00:00
Christian Schwarz
d91539b888 Revert "CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking"
This reverts commit 61fac1ab0b.
2023-10-10 12:17:41 +00:00
Christian Schwarz
61fac1ab0b CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking
This makes Delta/Image ::load fns fully tokio-epoll-uring
2023-10-10 11:56:31 +00:00
Christian Schwarz
8d3e8078f7 comment out any spans in page cache 2023-10-10 11:56:31 +00:00
Christian Schwarz
373fa7c2ac origin/problame/page-cache-forward-progress/3: trace spans and events only for tests 2023-10-09 20:21:22 +00:00
Christian Schwarz
1556234d9a CP tokio_epoll_uring for read path 2023-10-09 20:20:59 +00:00
Christian Schwarz
55106aa981 Revert "WIP limit eviction task concurrency"
This reverts commit 64680b1373.
2023-10-09 19:47:17 +00:00
Christian Schwarz
64680b1373 WIP limit eviction task concurrency 2023-10-09 19:47:04 +00:00
Christian Schwarz
b86cd24a23 disable concurrent compaction limit (it wasn't there when I first analyzed the issue) 2023-10-09 19:29:47 +00:00
Christian Schwarz
d85baac608 REPRO: rebase fallout & add some instructions 2023-10-09 19:10:28 +00:00
Christian Schwarz
f06f274b38 REPRO the problem: uses 430GB of space; 4 seconds load time; constant 20kIOPS after ~20s 2023-10-09 19:10:22 +00:00
Christian Schwarz
d98575f5a6 Revert "revert recent VirtualFile asyncification changes (#5291)"
This reverts commit ab1f37e908.
2023-10-09 19:02:59 +00:00
Christian Schwarz
33d0072342 move into library 2023-10-09 21:02:27 +02:00
Christian Schwarz
174bceccb1 commented out the check for just-once-polled, works now, don't understand why though 2023-10-09 19:26:47 +02:00
Christian Schwarz
f5bbba5014 fixes 2023-10-09 17:54:44 +02:00
Christian Schwarz
868cf8aeb5 hand-roll it instead 2023-10-06 18:45:41 +02:00
Christian Schwarz
9f03dd24c2 page_cache: find_victim: prevent starvation 2023-10-05 16:54:02 +02:00
Christian Schwarz
dc96a7604a page_cache: ensure forward progress on cache miss 2023-10-05 16:51:08 +02:00
Christian Schwarz
d7c94e67ce inline lock_for_write and try_lock_for_write into memorize_materialized_page
Motivation
==========

It's the only user, and the name of `_for_write` is wrong as of

    commit 7a63685cde
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Fri Aug 18 19:31:03 2023 +0200

        simplify page-caching of EphemeralFile (#4994)

Notes
=====

This also allows us to get rid of the WriteBufResult type.

Also rename `search_mapping_for_write` to `search_mapping_exact`.
It makes more sense that way because there is no `_for_write`-locking
anymore.
2023-10-05 16:01:29 +02:00
22 changed files with 874 additions and 344 deletions

Cargo.lock generated
View File

@@ -2393,6 +2393,17 @@ dependencies = [
"minimal-lexical",
]
[[package]]
name = "nostarve_queue"
version = "0.1.0"
dependencies = [
"futures",
"rand",
"scopeguard",
"tokio",
"tracing",
]
[[package]]
name = "notify"
version = "5.2.0"
@@ -2704,6 +2715,7 @@ dependencies = [
"itertools",
"metrics",
"nix 0.26.2",
"nostarve_queue",
"num-traits",
"num_cpus",
"once_cell",

View File

@@ -26,6 +26,7 @@ members = [
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
"libs/nostarve_queue",
]
[workspace.package]
@@ -180,6 +181,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
nostarve_queue = { path = "./libs/nostarve_queue" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

View File

@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRY_UNTIL_SECS: u64 = 40;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
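// i.e. RETRIES = (40 * 1000) / 100 = 400 attempts of 100ms each, with a
// progress dot every 10 retries (once per second).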

View File

@@ -18,7 +18,7 @@ use camino::Utf8PathBuf;
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::auth::{Claims, Scope};
@@ -93,7 +93,7 @@ impl PageServerNode {
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
conf: conf.clone(),
env: env.clone(),
http_client: Client::new(),
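// NB: reqwest's blocking Client defaults to a 30s request timeout;
// .timeout(None) disables it, presumably so long-running admin calls
// (e.g. layer downloads during the repro) don't get cut off.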
http_client: ClientBuilder::new().timeout(None).build().unwrap(),
http_base_url: format!("http://{}/v1", conf.listen_http_addr),
}
}

download_all_layers.py Normal file
View File

@@ -0,0 +1,20 @@
import requests

tenants = requests.get("http://localhost:15003/v1/tenant")
tenants.raise_for_status()
tenants = tenants.json()

for tenant in tenants:
    id = tenant["id"]
    timelines = requests.get(f"http://localhost:15003/v1/tenant/{id}/timeline")
    timelines.raise_for_status()
    for timeline in timelines.json():
        tid = timeline["tenant_id"]
        tlid = timeline["timeline_id"]
        layers = requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer")
        layers.raise_for_status()
        layers = layers.json()
        for l in layers["historic_layers"]:
            if l["remote"] == False:
                requests.get(f"http://localhost:15003/v1/tenant/{tid}/timeline/{tlid}/layer/{l['layer_file_name']}")

View File

@@ -0,0 +1,14 @@
[package]
name = "nostarve_queue"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
scopeguard.workspace = true
tracing.workspace = true

[dev-dependencies]
futures.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }

View File

@@ -0,0 +1,316 @@
//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
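//!
//! Contract, as implemented below: a task calls `Queue::begin()` to take a
//! `Position`, does the shared work, then calls
//! `Position::complete_and_wait(datum)`. The datum is handed to the
//! longest-waiting position (FIFO order), and the returned future resolves
//! with whichever datum some task hands to *our* position, so a waiter
//! cannot be starved by faster peers.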
use std::{
    collections::VecDeque,
    fmt,
    future::poll_fn,
    sync::Mutex,
    task::{Poll, Waker},
};

pub struct Queue<T> {
    inner: Mutex<Inner<T>>,
}

struct Inner<T> {
    waiters: VecDeque<usize>,
    free: VecDeque<usize>,
    slots: Vec<Option<(Option<Waker>, Option<T>)>>,
}

#[derive(Clone, Copy)]
pub struct Position<'q, T> {
    idx: usize,
    queue: &'q Queue<T>,
}

impl<T> fmt::Debug for Position<'_, T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Position").field("idx", &self.idx).finish()
    }
}

impl<T> Inner<T> {
    #[cfg(not(test))]
    #[inline]
    fn integrity_check(&self) {}

    #[cfg(test)]
    fn integrity_check(&self) {
        use std::collections::HashSet;
        let waiters = self.waiters.iter().copied().collect::<HashSet<_>>();
        let free = self.free.iter().copied().collect::<HashSet<_>>();
        for (slot_idx, slot) in self.slots.iter().enumerate() {
            match slot {
                None => {
                    assert!(!waiters.contains(&slot_idx));
                    assert!(free.contains(&slot_idx));
                }
                Some((None, None)) => {
                    assert!(waiters.contains(&slot_idx));
                    assert!(!free.contains(&slot_idx));
                }
                Some((Some(_), Some(_))) => {
                    assert!(!waiters.contains(&slot_idx));
                    assert!(!free.contains(&slot_idx));
                }
                Some((Some(_), None)) => {
                    assert!(waiters.contains(&slot_idx));
                    assert!(!free.contains(&slot_idx));
                }
                Some((None, Some(_))) => {
                    assert!(!waiters.contains(&slot_idx));
                    assert!(!free.contains(&slot_idx));
                }
            }
        }
    }
}

impl<T> Queue<T> {
    pub fn new(size: usize) -> Self {
        Queue {
            inner: Mutex::new(Inner {
                waiters: VecDeque::new(),
                free: (0..size).collect(),
                slots: {
                    let mut v = Vec::with_capacity(size);
                    v.resize_with(size, || None);
                    v
                },
            }),
        }
    }

    pub fn begin(&self) -> Result<Position<T>, ()> {
        #[cfg(test)]
        tracing::trace!("get in line locking inner");
        let mut inner = self.inner.lock().unwrap();
        inner.integrity_check();
        let my_waitslot_idx = inner
            .free
            .pop_front()
            .expect("can't happen, len(slots) = len(waiters)");
        inner.waiters.push_back(my_waitslot_idx);
        let prev = inner.slots[my_waitslot_idx].replace((None, None));
        assert!(prev.is_none());
        inner.integrity_check();
        Ok(Position {
            idx: my_waitslot_idx,
            queue: &self,
        })
    }
}

impl<'q, T> Position<'q, T> {
    pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
        #[cfg(test)]
        tracing::trace!("found victim locking waiters");
        let mut inner = self.queue.inner.lock().unwrap();
        inner.integrity_check();
        let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
        #[cfg(test)]
        tracing::trace!(winner_idx, "putting victim into next waiters slot");
        let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
        let prev = winner_slot.1.replace(datum);
        assert!(
            prev.is_none(),
            "ensure we didn't mess up this simple ring buffer structure"
        );
        if let Some(waker) = winner_slot.0.take() {
            #[cfg(test)]
            tracing::trace!(winner_idx, "waking up winner");
            waker.wake()
        }
        inner.integrity_check();
        drop(inner); // the poll_fn locks it again
        let mut poll_num = 0;
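        // NB: the returned future is not cancellation-safe: dropping it before
        // it resolves would lose the datum that some other task hands to our
        // position, so the guard below turns such a drop into a panic.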
        let mut drop_guard = Some(scopeguard::guard((), |()| {
            panic!("must not drop this future until Ready");
        }));
        // take the victim that was found by someone else
        poll_fn(move |cx| {
            let my_waitslot_idx = self.idx;
            poll_num += 1;
            #[cfg(test)]
            tracing::trace!(poll_num, "poll_fn locking waiters");
            let mut inner = self.queue.inner.lock().unwrap();
            inner.integrity_check();
            let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
            // assert!(
            //     poll_num <= 2,
            //     "once we place the waker in the slot, next wakeup should have a result: {}",
            //     my_waitslot.1.is_some()
            // );
            if let Some(res) = my_waitslot.1.take() {
                #[cfg(test)]
                tracing::trace!(poll_num, "have cache slot");
                // above .take() resets the waiters slot to None
                debug_assert!(my_waitslot.0.is_none());
                debug_assert!(my_waitslot.1.is_none());
                inner.slots[my_waitslot_idx] = None;
                inner.free.push_back(my_waitslot_idx);
                let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
                inner.integrity_check();
                return Poll::Ready(res);
            }
            // assert_eq!(poll_num, 1);
            if !my_waitslot
                .0
                .as_ref()
                .map(|existing| cx.waker().will_wake(existing))
                .unwrap_or(false)
            {
                let prev = my_waitslot.0.replace(cx.waker().clone());
                #[cfg(test)]
                tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
            }
            inner.integrity_check();
            #[cfg(test)]
            tracing::trace!(poll_num, "waiting to be woken up");
            Poll::Pending
        })
    }
}

#[cfg(test)]
mod test {
    use std::{
        sync::{
            atomic::{AtomicBool, Ordering},
            Arc,
        },
        task::Poll,
        time::Duration,
    };

    use rand::RngCore;

    #[tokio::test]
    async fn in_order_completion_and_wait() {
        let queue = super::Queue::new(2);
        let q1 = queue.begin().unwrap();
        let q2 = queue.begin().unwrap();
        assert_eq!(q1.complete_and_wait(23).await, 23);
        assert_eq!(q2.complete_and_wait(42).await, 42);
    }

    #[tokio::test]
    async fn out_of_order_completion_and_wait() {
        let queue = super::Queue::new(2);
        let q1 = queue.begin().unwrap();
        let q2 = queue.begin().unwrap();
        let mut q2compfut = q2.complete_and_wait(23);
        match futures::poll!(&mut q2compfut) {
            Poll::Pending => {}
            Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
        }
        let q1res = q1.complete_and_wait(42).await;
        assert_eq!(q1res, 23);
        let q2res = q2compfut.await;
        assert_eq!(q2res, 42);
    }

    #[tokio::test]
    async fn in_order_completion_out_of_order_wait() {
        let queue = super::Queue::new(2);
        let q1 = queue.begin().unwrap();
        let q2 = queue.begin().unwrap();
        let mut q1compfut = q1.complete_and_wait(23);
        let mut q2compfut = q2.complete_and_wait(42);
        match futures::poll!(&mut q2compfut) {
            Poll::Pending => {
                unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
            }
            Poll::Ready(x) => assert_eq!(x, 42),
        }
        assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn stress() {
        let ntasks = 8;
        let queue_size = 8;
        let queue = Arc::new(super::Queue::new(queue_size));
        let stop = Arc::new(AtomicBool::new(false));
        let mut tasks = vec![];
        for i in 0..ntasks {
            let jh = tokio::spawn({
                let queue = Arc::clone(&queue);
                let stop = Arc::clone(&stop);
                async move {
                    while !stop.load(Ordering::Relaxed) {
                        let q = queue.begin().unwrap();
                        for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
                            std::hint::spin_loop();
                        }
                        q.complete_and_wait(i).await;
                        tokio::task::yield_now().await;
                    }
                }
            });
            tasks.push(jh);
        }
        tokio::time::sleep(Duration::from_secs(10)).await;
        stop.store(true, Ordering::Relaxed);
        for t in tasks {
            t.await.unwrap();
        }
    }

    #[test]
    fn stress_two_runtimes_shared_queue() {
        std::thread::scope(|s| {
            let ntasks = 8;
            let queue_size = 8;
            let queue = Arc::new(super::Queue::new(queue_size));
            let stop = Arc::new(AtomicBool::new(false));
            for i in 0..ntasks {
                s.spawn({
                    let queue = Arc::clone(&queue);
                    let stop = Arc::clone(&stop);
                    move || {
                        let rt = tokio::runtime::Builder::new_current_thread()
                            .enable_all()
                            .build()
                            .unwrap();
                        rt.block_on(async move {
                            while !stop.load(Ordering::Relaxed) {
                                let q = queue.begin().unwrap();
                                for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
                                    std::hint::spin_loop();
                                }
                                q.complete_and_wait(i).await;
                                tokio::task::yield_now().await;
                            }
                        });
                    }
                });
            }
            std::thread::sleep(Duration::from_secs(10));
            stop.store(true, Ordering::Relaxed);
        });
    }
}

View File

@@ -37,6 +37,7 @@ humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
nix.workspace = true
nostarve_queue.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }
num-traits.workspace = true

View File

@@ -580,6 +580,31 @@ fn start_pageserver(
);
}
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::BackgroundRuntimeTurnaroundMeasure,
None,
None,
"background runtime turnaround measure",
true,
async move {
let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
let server = server
.serve(hyper::service::make_service_fn(|_| async move {
Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
move |_: hyper::Request<hyper::Body>| async move {
Ok::<_, std::convert::Infallible>(hyper::Response::new(
hyper::Body::from(format!("alive")),
))
},
))
}))
.with_graceful_shutdown(task_mgr::shutdown_watcher());
server.await?;
Ok(())
},
);
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.

View File

@@ -2,6 +2,7 @@
//! and push them to a HTTP endpoint.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
@@ -143,7 +144,7 @@ pub async fn collect_metrics(
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
);
}
}
@@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker(
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
// TODO should we just use concurrent_background_tasks_rate_limit().
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out to be really handy for understanding the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
@@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker(
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
);
}
}

View File

@@ -314,7 +314,6 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
AcquirePinnedSlotTimeout,
EvictIterLimit,
}
pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -1061,6 +1060,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",

View File

@@ -66,8 +66,7 @@
//! inserted to the mapping, but you must hold the write-lock on the slot until
//! the contents are valid. If you need to release the lock without initializing
//! the contents, you must remove the mapping first. We make that easy for the
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
//! page, the caller must explicitly call guard.mark_valid() after it has
//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
//! initialized it. If the guard is dropped without calling mark_valid(), the
//! mapping is automatically removed and the slot is marked free.
//!
@@ -84,6 +83,7 @@ use std::{
use anyhow::Context;
use once_cell::sync::OnceCell;
use tracing::instrument;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
@@ -253,6 +253,9 @@ pub struct PageCache {
next_evict_slot: AtomicUsize,
size_metrics: &'static PageCacheSizeMetrics,
find_victim_waiters:
nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
@@ -286,23 +289,25 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
///
/// Counterintuitively, this is used even for a read, if the requested page is not
/// currently found in the page cache. In that case, the caller of lock_for_read()
/// is expected to fill in the page contents and call mark_valid(). Similarly
/// lock_for_write() can return an invalid buffer that the caller is expected to
/// to initialize.
///
/// is expected to fill in the page contents and call mark_valid().
pub struct PageWriteGuard<'i> {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
state: PageWriteGuardState<'i>,
}
_permit: PinnedSlotsPermit,
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
enum PageWriteGuardState<'i> {
Invalid {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
_permit: PinnedSlotsPermit,
},
Downgraded,
}
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.inner.buf
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
}
}
@@ -310,25 +315,37 @@ impl std::ops::Deref for PageWriteGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.inner.buf
match &self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
self.inner.buf
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => todo!(),
}
}
}
impl PageWriteGuard<'_> {
impl<'a> PageWriteGuard<'a> {
/// Mark that the buffer contents are now valid.
pub fn mark_valid(&mut self) {
assert!(self.inner.key.is_some());
assert!(
!self.valid,
"mark_valid called on a buffer that was already valid"
);
self.valid = true;
#[must_use]
pub fn mark_valid(mut self) -> PageReadGuard<'a> {
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
match prev {
PageWriteGuardState::Invalid { inner, _permit } => {
assert!(inner.key.is_some());
PageReadGuard {
_permit: Arc::new(_permit),
slot_guard: inner.downgrade(),
}
}
PageWriteGuardState::Downgraded => unreachable!(),
}
}
}
@@ -339,11 +356,14 @@ impl Drop for PageWriteGuard<'_> {
/// initializing it, remove the mapping from the page cache.
///
fn drop(&mut self) {
assert!(self.inner.key.is_some());
if !self.valid {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => {
assert!(inner.key.is_some());
let self_key = inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
inner.key = None;
}
PageWriteGuardState::Downgraded => {}
}
}
}
@@ -354,12 +374,6 @@ pub enum ReadBufResult<'a> {
NotFound(PageWriteGuard<'a>),
}
/// lock_for_write() return value
pub enum WriteBufResult<'a> {
Found(PageWriteGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
impl PageCache {
//
// Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -429,8 +443,9 @@ impl PageCache {
///
/// Store an image of the given page in the cache.
///
// #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
pub async fn memorize_materialized_page(
&self,
&'static self,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
@@ -446,26 +461,84 @@ impl PageCache {
lsn,
};
match self.lock_for_write(&cache_key).await? {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(*write_guard == img);
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(&cache_key) {
slot.inc_usage_count();
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
debug_assert_eq!(inner.buf.len(), img.len());
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(inner.buf == img);
return Ok(());
}
}
WriteBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(img);
write_guard.mark_valid();
}
}
debug_assert!(permit.is_some());
Ok(())
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
// Create a write guard for the slot so we go through the expected motions.
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
let mut write_guard = PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
};
write_guard.copy_from_slice(img);
let _ = write_guard.mark_valid();
return Ok(());
}
}
// Section 1.2: Public interface functions for working with immutable file pages.
// #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
pub async fn read_immutable_buf(
&self,
&'static self,
file_id: FileId,
blkno: u32,
ctx: &RequestContext,
@@ -571,7 +644,7 @@ impl PageCache {
/// ```
///
async fn lock_for_read(
&self,
&'static self,
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
@@ -638,99 +711,10 @@ impl PageCache {
);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
_permit: permit.take().unwrap(),
inner,
valid: false,
}));
}
}
/// Look up a page in the cache and lock it in write mode. If it's not
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
async fn try_lock_for_write(
&self,
cache_key: &CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Some(PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
valid: true,
});
}
}
None
}
/// Return a write-locked buffer for given block.
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
debug_assert!(permit.is_none());
return Ok(WriteBufResult::Found(write_guard));
}
debug_assert!(permit.is_some());
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
_permit: permit.take().unwrap(),
inner,
valid: false,
}));
}
}
@@ -775,7 +759,7 @@ impl PageCache {
///
/// Like 'search_mapping', but performs an "exact" search. Used for
/// allocating a new buffer.
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
@@ -882,10 +866,15 @@ impl PageCache {
///
/// On return, the slot is empty and write-locked.
async fn find_victim(
&self,
&'static self,
_permit_witness: &PinnedSlotsPermit,
) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let nostarve_position = self.find_victim_waiters.begin()
.expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
// let span = tracing::trace_span!("find_victim", ?nostarve_position);
// let _enter = span.enter();
let mut iters = 0;
loop {
iters += 1;
@@ -897,41 +886,8 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
if iters > iter_limit {
// NB: Even with the permits, there's no hard guarantee that we will find a slot with
// any particular number of iterations: other threads might race ahead and acquire and
// release pins just as we're scanning the array.
//
// Imagine that nslots is 2, and as starting point, usage_count==1 on all
// slots. There are two threads running concurrently, A and B. A has just
// acquired the permit from the semaphore.
//
// A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
// B: Acquire permit.
// B: Look at slot 2, decrement its usage_count to zero and continue the search
// B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
// B: Acquire permit.
// B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
// B: Release pin and permit again
//
// Now we're back in the starting situation that both slots have
// usage_count 1, but A has now been through one iteration of the
// find_victim() loop. This can repeat indefinitely and on each
// iteration, A's iteration count increases by one.
//
// So, even though the semaphore for the permits is fair, the victim search
// itself happens in parallel and is not fair.
// Hence even with a permit, a task can theoretically be starved.
// To avoid this, we'd need tokio to give priority to tasks that are holding
// permits for longer.
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
unreachable!("find_victim_waiters prevents starvation");
}
continue;
}
@@ -942,7 +898,8 @@ impl PageCache {
inner.key = None;
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
}
}
}
@@ -986,6 +943,7 @@ impl PageCache {
next_evict_slot: AtomicUsize::new(0),
size_metrics,
pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
}
}
}

View File

@@ -293,6 +293,8 @@ pub enum TaskKind {
DebugTool,
BackgroundRuntimeTurnaroundMeasure,
#[cfg(test)]
UnitTest,
}

View File

@@ -186,26 +186,21 @@ impl FileBlockReader {
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
Ok(write_guard.mark_valid().into())
}
}
}
}

View File

@@ -72,36 +72,32 @@ impl EphemeralFile {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum, self.file.path, e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
};
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -171,7 +167,7 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
write_guard.mark_valid();
let _ = write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {

View File

@@ -864,11 +864,11 @@ impl DeltaLayerInner {
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
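// NB: presumably disabled so the repro's copied tenant directories still
// load: their on-disk summaries carry the template tenant/timeline ids (see
// the "summaries contain tenant and timeline ids" comment in
// test_pageserver_startup_many_tenants.py).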
}
}

View File

@@ -457,11 +457,11 @@ impl ImageLayerInner {
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
}
}

View File

@@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
Compaction,
Gc,
Eviction,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
}
impl BackgroundLoopKind {
fn as_static_str(&self) -> &'static str {
let s: &'static str = self.into();
s
}
}
pub(crate) enum RateLimitError {
Cancelled,
}
pub(crate) async fn concurrent_background_tasks_rate_limit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
cancel: &CancellationToken,
) -> Result<impl Drop, RateLimitError> {
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
.with_label_values(&[loop_kind.as_static_str()])
.inc();
scopeguard::defer!(
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
);
tokio::select! {
permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
match permit {
Ok(permit) => Ok(permit),
Err(_closed) => unreachable!("we never close the semaphore"),
}
},
_ = cancel.cancelled() => {
Err(RateLimitError::Cancelled)
}
}
}
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
@@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, "gc");
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay(
}
/// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
pub(crate) fn warn_when_period_overrun(
elapsed: Duration,
period: Duration,
task: BackgroundLoopKind,
) {
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
if elapsed >= period && period != Duration::ZERO {
// humantime does no significant digits clamping whereas Duration's debug is a bit more
@@ -267,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
warn!(
?elapsed,
period = %humantime::format_duration(period),
task,
?task,
"task iteration took longer than the configured period"
);
crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
.with_label_values(&[task, &format!("{}", period.as_secs())])
.with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())])
.inc();
}
}

View File

@@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
};
use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -158,7 +159,7 @@ pub struct Timeline {
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
/// Never changes for the lifetime of this [`Timeline`] object.
///
///
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
/// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
generation: Generation,
@@ -684,37 +685,17 @@ impl Timeline {
) -> anyhow::Result<()> {
const ROUNDS: usize = 2;
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over its period (20s) which is quite often in production.
let _permit = tokio::select! {
permit = CONCURRENT_COMPACTIONS.acquire() => {
permit
},
_ = cancel.cancelled() => {
return Ok(());
}
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::Compaction,
ctx,
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return Ok(()),
};
let last_record_lsn = self.get_last_record_lsn();

View File

@@ -30,6 +30,7 @@ use crate::{
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
storage_layer::PersistentLayer,
tasks::{BackgroundLoopKind, RateLimitError},
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
@@ -129,7 +130,11 @@ impl Timeline {
ControlFlow::Continue(()) => (),
}
let elapsed = start.elapsed();
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
crate::tenant::tasks::warn_when_period_overrun(
elapsed,
p.period,
BackgroundLoopKind::Eviction,
);
crate::metrics::EVICTION_ITERATION_DURATION
.get_metric_with_label_values(&[
&format!("{}", p.period.as_secs()),
@@ -150,6 +155,17 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::Eviction,
ctx,
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
};
// If we evict layers but keep cached values derived from those layers, then
// we face a storm of on-demand downloads after pageserver restart.
// The reason is that the restart empties the caches, and so, the values

View File

@@ -18,7 +18,8 @@ use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -110,7 +111,7 @@ impl OpenFiles {
///
/// On return, we hold a lock on the slot, and its 'tag' has been updated
/// and recently_used has been set. It's all ready for reuse.
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
//
// Run the clock algorithm to find a slot to replace.
//
@@ -142,7 +143,7 @@ impl OpenFiles {
}
retries += 1;
} else {
slot_guard = slot.inner.write().unwrap();
slot_guard = slot.inner.write().await;
index = next;
break;
}
@@ -153,7 +154,7 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
// distinguish the two.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::CloseByReplace)
@@ -208,6 +209,29 @@ impl CrashsafeOverwriteError {
}
}
/// Observe duration for the given storage I/O operation
///
/// Unlike `observe_closure_duration`, this supports async,
/// where "support" means that we measure wall clock time.
macro_rules! observe_duration {
($op:expr, $($body:tt)*) => {{
let instant = Instant::now();
let result = $($body)*;
let elapsed = instant.elapsed().as_secs_f64();
STORAGE_IO_TIME_METRIC
.get($op)
.observe(elapsed);
result
}}
}
macro_rules! with_file {
($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
let $ident = $this.lock_file().await?;
observe_duration!($op, $($body)*)
}};
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -244,11 +268,9 @@ impl VirtualFile {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
// Strip all options other than read and write.
//
@@ -331,22 +353,24 @@ impl VirtualFile {
/// Call File::sync_all() on the underlying File.
pub async fn sync_all(&self) -> Result<(), Error> {
self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
.await?
with_file!(self, StorageIoOperation::Fsync, |file| file
.as_ref()
.sync_all())
}
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
.await?
with_file!(self, StorageIoOperation::Metadata, |file| file
.as_ref()
.metadata())
}
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.
async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
where
F: FnMut(&File) -> R,
{
/// Helper function internal to `VirtualFile` that looks up the underlying File,
/// opens it and evicts some other File if necessary. The passed parameter is
/// assumed to be a function available for the physical `File`.
///
/// We are doing it via a macro as Rust doesn't support async closures that
/// take on parameters with lifetimes.
async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
let open_files = get_open_files();
let mut handle_guard = {
@@ -356,27 +380,23 @@ impl VirtualFile {
// We only need to hold the handle lock while we read the current handle. If
// another thread closes the file and recycles the slot for a different file,
// we will notice that the handle we read is no longer valid and retry.
let mut handle = *self.handle.read().unwrap();
let mut handle = *self.handle.read().await;
loop {
// Check if the slot contains our File
{
let slot = &open_files.slots[handle.index];
let slot_guard = slot.inner.read().unwrap();
if slot_guard.tag == handle.tag {
if let Some(file) = &slot_guard.file {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(file)));
}
let slot_guard = slot.inner.read().await;
if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(FileGuard { slot_guard });
}
}
// The slot didn't contain our File. We will have to open it ourselves,
// but before that, grab a write lock on handle in the VirtualFile, so
// that no other thread will try to concurrently open the same file.
let handle_guard = self.handle.write().unwrap();
let handle_guard = self.handle.write().await;
// If another thread changed the handle while we were not holding the lock,
// then the handle might now be valid again. Loop back to retry.
@@ -390,17 +410,10 @@ impl VirtualFile {
// We need to open the file ourselves. The handle in the VirtualFile is
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot();
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
// Open the physical file
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME_METRIC
.get(op)
.observe_closure_duration(|| func(&file));
let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
@@ -408,7 +421,9 @@ impl VirtualFile {
*handle_guard = handle;
Ok(result)
return Ok(FileGuard {
slot_guard: slot_guard.downgrade(),
});
}
pub fn remove(self) {
@@ -423,11 +438,9 @@ impl VirtualFile {
self.pos = offset;
}
SeekFrom::End(offset) => {
self.pos = self
.with_file(StorageIoOperation::Seek, |mut file| {
file.seek(SeekFrom::End(offset))
})
.await??
self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
.as_ref()
.seek(SeekFrom::End(offset)))?
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
@@ -515,9 +528,9 @@ impl VirtualFile {
}
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = self
.with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
.await?;
let result = with_file!(self, StorageIoOperation::Read, |file| file
.as_ref()
.read_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -527,9 +540,9 @@ impl VirtualFile {
}
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = self
.with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
.await?;
let result = with_file!(self, StorageIoOperation::Write, |file| file
.as_ref()
.write_at(buf, offset));
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -539,6 +552,18 @@ impl VirtualFile {
}
}
struct FileGuard<'a> {
slot_guard: RwLockReadGuard<'a, SlotInner>,
}
impl<'a> AsRef<File> for FileGuard<'a> {
fn as_ref(&self) -> &File {
// This unwrap is safe because we only create `FileGuard`s
// if we know that the file is Some.
self.slot_guard.file.as_ref().unwrap()
}
}
#[cfg(test)]
impl VirtualFile {
pub(crate) async fn read_blk(
@@ -571,20 +596,39 @@ impl VirtualFile {
impl Drop for VirtualFile {
/// If a VirtualFile is dropped, close the underlying file if it was open.
fn drop(&mut self) {
let handle = self.handle.get_mut().unwrap();
let handle = self.handle.get_mut();
// We could check with a read-lock first, to avoid waiting on an
// unrelated I/O.
let slot = &get_open_files().slots[handle.index];
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
if slot_guard.tag == tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also the `CloseByReplace` operation for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
// We don't have async drop so we cannot directly await the lock here.
// Instead, first do a best-effort attempt at closing the underlying
// file descriptor by using `try_write`, and if that fails, spawn
// a tokio task to do it asynchronously: we just want it to be
// cleaned up eventually.
// Most of the time, the `try_lock` should succeed though,
// as we have `&mut self` access. In other words, if the slot
// is still occupied by our file, there should be no access from
// other I/O operations; the only other possible place to lock
// the slot is the clock algorithm looking for free slots.
let slot = &get_open_files().slots[handle.index];
if let Ok(slot_guard) = slot.inner.try_write() {
clean_slot(slot, slot_guard, handle.tag);
} else {
let tag = handle.tag;
tokio::spawn(async move {
let slot_guard = slot.inner.write().await;
clean_slot(slot, slot_guard, tag);
});
};
}
}

View File

@@ -0,0 +1,52 @@
import queue
import threading

from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.types import TenantId

"""
553 sudo mkfs.ext4 /dev/nvme1n1
555 mkdir test_output
556 sudo mount /dev/nvme1n1 test_output
557 htop
559 ./scripts/pysync
560 NEON_BIN=/home/admin/neon/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py
561 sudo chown -R admin:admin test_output
cargo build_testing --release
562 NEON_BIN=$PWD/target/release DEFAULT_PG_VERSION=15 ./scripts/pytest --preserve-database-files --timeout=0 ./test_runner/performance/test_pageserver_startup_many_tenants.py

cd test_output/test_pageserver_startup_many_tenants/repo
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local start
# watch initial load complete, then background jobs start. That's the interesting part.
sudo env NEON_REPO_DIR=$PWD prlimit --nofile=300000:300000 ../../../target/release/neon_local stop
# usually pageserver won't be responsive, kill with
sudo pkill -9 pageserver
"""


def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()

    # below doesn't work because summaries contain tenant and timeline ids and we check for them
    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
    pshttp = env.pageserver.http_client()

    ep = env.endpoints.create_start("main")
    ep.safe_psql("create table foo(b text)")
    for i in range(0, 8):
        ep.safe_psql("insert into foo(b) values ('some text')")
    # pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
    wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
    pshttp.timeline_checkpoint(tenant_id, timeline_id)
    ep.stop_and_destroy()

    env.pageserver.stop()
    for sk in env.safekeepers:
        sk.stop()

    tenant_dir = env.repo_dir / "pageserver_1" / "tenants" / str(env.initial_tenant)
    for i in range(0, 20_000):
        import shutil

        shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))