Compare commits

..

32 Commits

Author SHA1 Message Date
Christian Schwarz
02205e9191 measured BACKGROUND_RUNTIME performance using wrk
Launch wrk from the command line 3-4 seconds after the load starts.
=> blocking of executor threads is clearly visible, my branch
  performs _much_ better.

baseline: commit 15b8618d25 (HEAD -> problame/loadtest-baseline, origin/problame/loadtest-baseline, main)
neon-main (compaction semaphore disabled!)

admin@ip-172-31-13-23:[~/neon]: wrk --latency http://localhost:2342
Running 10s test @ http://localhost:2342
  2 threads and 10 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    71.42ms   15.97ms 125.18ms   70.82%
    Req/Sec    41.44     28.85   101.00     57.35%
  Latency Distribution
     50%   72.53ms
     75%   82.07ms
     90%   91.44ms
     99%  116.56ms
  291 requests in 10.01s, 22.73KB read
  Socket errors: connect 0, read 0, write 0, timeout 10
Requests/sec:     29.07
Transfer/sec:      2.27KB

this branch (compaction semaphore also disabled!):

admin@ip-172-31-13-23:[~/neon]: wrk --latency http://localhost:2342
Running 10s test @ http://localhost:2342
  2 threads and 10 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    45.74ms   64.13ms 293.44ms   83.27%
    Req/Sec   442.81    258.18     1.32k    69.79%
  Latency Distribution
     50%    2.92ms
     75%   75.52ms
     90%  148.03ms
     99%  248.50ms
  8641 requests in 10.01s, 675.08KB read
Requests/sec:    862.81
Transfer/sec:     67.41KB
2023-08-31 08:02:17 +00:00
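
A minimal, self-contained sketch (not part of this branch) of the effect the numbers above illustrate: when tasks block a runtime worker thread with synchronous I/O, even a trivial unrelated task such as the `alive` responder has to wait behind them, whereas offloading the blocking work (via spawn_blocking, or io_uring as on this branch) keeps the workers free. Worker counts and sleep durations below are invented for illustration.

```rust
// Illustration only: blocking executor threads inflates the latency of unrelated tasks.
// Cargo: tokio = { version = "1", features = ["rt-multi-thread", "macros"] }
use std::time::{Duration, Instant};

#[tokio::main(flavor = "multi_thread", worker_threads = 2)]
async fn main() {
    // Simulated load: tasks that block their worker thread, like synchronous read_at calls.
    for _ in 0..8 {
        tokio::spawn(async {
            loop {
                std::thread::sleep(Duration::from_millis(50)); // blocks the whole worker
                tokio::task::yield_now().await;
            }
        });
    }
    // Simulated wrk probe: even a no-op task now typically takes tens of milliseconds.
    // Moving the blocking work to spawn_blocking (or doing the I/O via io_uring)
    // keeps the worker threads free and the probe latency in the microsecond range.
    let started = Instant::now();
    tokio::spawn(async {}).await.unwrap();
    println!("probe latency under blocking load: {:?}", started.elapsed());
}
```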
Christian Schwarz
66c501f5b8 HACK: BACKGROUND_RUNTIME webserver to measure response time using wrk 2023-08-31 07:50:13 +00:00
Christian Schwarz
887f464825 use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking
This makes Delta/Image ::load fns fully tokio-epoll-uring
2023-08-29 19:14:35 +00:00
Christian Schwarz
0b8ff8dbe0 Revert "switch back to spawn_blocking to make the comparison"
This reverts commit 60971e282e.
2023-08-29 16:42:14 +00:00
Christian Schwarz
60971e282e switch back to spawn_blocking to make the comparison 2023-08-29 16:00:21 +00:00
Christian Schwarz
189aa1b077 with the page cache removed, we spend almost 0 time in futex 2023-08-29 15:53:41 +00:00
Christian Schwarz
9f087f93f8 buffer pool impl: re-use allocations 2023-08-29 15:49:05 +00:00
Christian Schwarz
fa1fb214b3 also rip out memoization code and make rest compile 2023-08-29 15:28:14 +00:00
Christian Schwarz
4db24c9de0 RIP out page cache, but keep memoization code (doesn't compile) 2023-08-29 15:27:55 +00:00
Christian Schwarz
5f920a9993 profile.release=debug 2023-08-29 15:05:02 +00:00
Christian Schwarz
2aac082385 disable concurrent compactions limiter 2023-08-29 13:05:18 +00:00
Christian Schwarz
99f8f87ba5 spawn_blocking-based file open for image and delta layer loading 2023-08-29 12:49:53 +00:00
Christian Schwarz
e7e1df2a79 tokio_epoll_uring for read path 2023-08-29 12:24:30 +00:00
Christian Schwarz
10ee8f7981 FileBlockReaderFile is not needed, was doing all the sync IO 2023-08-29 11:04:16 +00:00
Christian Schwarz
a5b6e32b01 PoC using spawn_blocking 2023-08-29 09:59:53 +00:00
Christian Schwarz
bb88e5bf57 try to convert to async, now lifetime errors because buffer lifetime is insufficient 2023-08-29 09:53:01 +00:00
Christian Schwarz
876efcfc0a REPRO the problem: uses 430GB of space; 4 seconds load time; constant 20kIOPS after ~20s 2023-08-29 09:39:39 +00:00
Christian Schwarz
4d69192ae5 QUICK HACK: rip out virtual file cache 2023-08-29 09:37:08 +00:00
Christian Schwarz
2d5d046062 WIP switch to tokio::sync::RwLock 2023-08-29 11:21:29 +02:00
Christian Schwarz
d8b8a203a4 WIP: async with_file & read_at_async, problem is that FileSlotGuard is not Send 2023-08-29 11:14:07 +02:00
Christian Schwarz
11e9b25f2b refactor: get_file_guard & base with_file on it 2023-08-29 11:02:42 +02:00
Christian Schwarz
235baffbf4 make the read path async, except the read_at impl 2023-08-29 10:50:21 +02:00
Christian Schwarz
5be0f9d69a read_at need not be public 2023-08-29 10:48:29 +02:00
Christian Schwarz
e91e4d0b96 move code around to minimize diff 2023-08-29 10:42:23 +02:00
Arpad Müller
edbe3d2f76 Remove Read impl that was only used in one place 2023-08-29 01:52:39 +02:00
Arpad Müller
0d9fa95454 Move used FileExt functions to inherent impls 2023-08-29 01:52:39 +02:00
Arpad Müller
e983b3cc2e Don't use generics bounded by trait 2023-08-29 01:52:39 +02:00
Arpad Müller
a362ab9169 Move VirtualFile::seek to inherent function 2023-08-28 22:46:36 +02:00
Arpad Müller
0cfc9edcb8 Make read_blk and parts of the page cache async
The returned PageReadGuard is not Send so we change the locks used for
the SlotInner's in the page cache to the ones from tokio.

Also, make read_blk async.
2023-08-28 22:43:39 +02:00
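
A minimal sketch (not from this patch) of the `Send` issue the message above refers to: a `std::sync` guard is `!Send`, so a future that holds one across an `.await` point cannot be spawned onto a multi-threaded runtime, while the guards of `tokio::sync` locks are `Send`. The `requires_send` helper is invented for the example.

```rust
// Cargo: tokio = { version = "1", features = ["sync", "rt"] }
use std::sync::Arc;

// Compile-time check standing in for tokio::spawn's `F: Future + Send` bound.
fn requires_send<F: std::future::Future + Send>(_f: F) {}

fn main() {
    let std_lock = Arc::new(std::sync::RwLock::new(0u8));
    let tokio_lock = Arc::new(tokio::sync::RwLock::new(0u8));

    // Does NOT compile if uncommented: std::sync::RwLockReadGuard is !Send,
    // and it would be held across the .await below.
    // requires_send(async move {
    //     let guard = std_lock.read().unwrap();
    //     tokio::task::yield_now().await;
    //     drop(guard);
    // });
    let _ = std_lock;

    // Compiles: tokio's read guard is Send, so the whole future stays Send.
    requires_send(async move {
        let guard = tokio_lock.read().await;
        tokio::task::yield_now().await;
        drop(guard);
    });
}
```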
Em Sharnoff
e40ee7c3d1 remove unused file 'vm-cgconfig.conf' (#5127)
Honestly no clue why it's still here, should have been removed ages ago.
This is handled by vm-builder now.
2023-08-28 13:04:57 -07:00
Christian Schwarz
0fe3b3646a page cache: don't proactively evict EphemeralFile pages (#5129)
Before this patch, when dropping an EphemeralFile, we'd scan the entire
`slots` to proactively evict its pages (`drop_buffers_for_immutable`).

This was _necessary_ before #4994 because the page cache was a
write-back cache: we'd be deleting the EphemeralFile from disk afterwards,
so, if we hadn't evicted its pages before that, write-back in
`find_victim` would have failed.

But, since #4994, the page cache is a read-only cache, so it's safe
to keep the dropped file's pages cached: they're never going to get
accessed again, and eventually `find_victim` will evict them.

The only remaining advantage of `drop_buffers_for_immutable` over
relying on `find_victim` is that `find_victim` has to do the clock
page replacement iterations until the count reaches 0,
whereas `drop_buffers_for_immutable` can kick the page out right away.

However, weigh that against the cost of `drop_buffers_for_immutable`,
which currently scans the entire `slots` array to find the
EphemeralFile's pages.

Alternatives have been proposed in #5122 and #5128, but, they come
with their own overheads & trade-offs.

Also, the real reason why we're looking into this piece of code is
that we want to make the slots rwlock async in #5023.
Since `drop_buffers_for_immutable` is called from drop, and there
is no async drop, it would be nice to not have to deal with this.

So, let's just stop doing `drop_buffers_for_immutable` and observe
the performance impact in benchmarks.
2023-08-28 20:42:18 +02:00
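
A toy, self-contained sketch (not the pageserver code) of the trade-off described above: the proactive path pays an O(slots) scan on every EphemeralFile drop, while the clock sweep in `find_victim` reclaims pages lazily, decrementing usage counts until it finds one at zero. All names below are invented for illustration.

```rust
#[derive(Clone, Copy)]
struct Slot {
    file_id: u64,
    usage: u8,
}

// drop_buffers_for_immutable-style: O(number of slots) work on every file drop.
fn drop_buffers_for_file(slots: &mut [Slot], dropped: u64) {
    for slot in slots.iter_mut().filter(|s| s.file_id == dropped) {
        slot.usage = 0; // immediately reclaimable
    }
}

// find_victim-style clock sweep: amortized, decrements usage counts until one hits 0.
fn find_victim(slots: &mut [Slot], hand: &mut usize) -> usize {
    loop {
        let i = *hand % slots.len();
        *hand += 1;
        if slots[i].usage == 0 {
            return i;
        }
        slots[i].usage -= 1;
    }
}

fn main() {
    let mut slots = vec![Slot { file_id: 1, usage: 3 }; 8];
    let mut hand = 0;
    drop_buffers_for_file(&mut slots, 1); // eager: touches all 8 slots right now
    let victim = find_victim(&mut slots, &mut hand); // lazy: walks only until a usage-0 slot
    println!("evicting slot {victim}");
}
```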
Em Sharnoff
529f8b5016 compute_ctl: Fix switched vm-monitor args (#5117)
Small switcheroo from #4946.
2023-08-28 14:55:41 +02:00
49 changed files with 2850 additions and 4247 deletions

Cargo.lock generated
View File

@@ -1996,6 +1996,26 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "io-uring"
version = "0.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd1e1a01cfb924fd8c5c43b6827965db394f5a3a16c599ce03452266e1cf984c"
dependencies = [
"bitflags",
"libc",
]
[[package]]
name = "io-uring"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "141a0f4546a50b2ed637c7a6df0d7dff45c9f41523254996764461c8ae0d9424"
dependencies = [
"bitflags",
"libc",
]
[[package]]
name = "ipnet"
version = "2.7.2"
@@ -2095,9 +2115,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.144"
version = "0.2.147"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
[[package]]
name = "libloading"
@@ -2663,6 +2683,7 @@ dependencies = [
"tenant_size_model",
"thiserror",
"tokio",
"tokio-epoll-uring",
"tokio-io-timeout",
"tokio-postgres",
"tokio-tar",
@@ -2836,9 +2857,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.9"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
[[package]]
name = "pin-utils"
@@ -3725,6 +3746,12 @@ dependencies = [
"windows-sys 0.42.0",
]
[[package]]
name = "scoped-tls"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
[[package]]
name = "scopeguard"
version = "1.1.0"
@@ -4296,18 +4323,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.40"
version = "1.0.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.40"
version = "1.0.47"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
dependencies = [
"proc-macro2",
"quote",
@@ -4392,22 +4419,40 @@ dependencies = [
[[package]]
name = "tokio"
version = "1.28.1"
version = "1.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9"
dependencies = [
"autocfg",
"backtrace",
"bytes",
"libc",
"mio",
"num_cpus",
"parking_lot 0.12.1",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.4.9",
"socket2 0.5.3",
"tokio-macros",
"windows-sys 0.48.0",
]
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=problame/hacky-openat#96e5a1f3a3d6921438002807475d01540e1211b2"
dependencies = [
"futures",
"io-uring 0.6.1",
"libc",
"once_cell",
"scopeguard",
"thiserror",
"tokio",
"tokio-uring",
"tokio-util",
"tracing",
]
[[package]]
name = "tokio-io-timeout"
version = "1.2.0"
@@ -4547,6 +4592,20 @@ dependencies = [
"tungstenite 0.20.0",
]
[[package]]
name = "tokio-uring"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d5e02bb137e030b3a547c65a3bd2f1836d66a97369fdcc69034002b10e155ef"
dependencies = [
"io-uring 0.5.13",
"libc",
"scoped-tls",
"slab",
"socket2 0.4.9",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.8"

View File

@@ -200,58 +200,3 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", re
# Besides, debug info should not affect the performance.
debug = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = false
[profile.release-line-debug]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
[profile.release-line-debug-lto]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
lto = true
[profile.release-line-debug-size]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
opt-level = "s"
[profile.release-line-debug-zize]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
opt-level = "z"
[profile.release-line-debug-size-lto]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
opt-level = "s"
lto = true
[profile.release-line-debug-zize-lto]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only
opt-level = "z"
lto = true
[profile.release-no-debug]
inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
[profile.release-no-debug-size]
inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
opt-level = "s"
[profile.release-no-debug-zize]
inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
opt-level = "z"
[profile.release-no-debug-size-lto]
inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
opt-level = "s"
lto = true
[profile.release-no-debug-zize-lto]
inherits = "release"
debug = false # true = 2 = all symbols, 1 = line only
opt-level = "z"
lto = true

View File

@@ -278,8 +278,8 @@ fn main() -> Result<()> {
use tokio_util::sync::CancellationToken;
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let cgroup = matches.get_one::<String>("filecache-connstr");
let file_cache_connstr = matches.get_one::<String>("cgroup");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and

View File

@@ -68,8 +68,6 @@ pub mod completion;
/// Reporting utilities
pub mod error;
pub mod sync;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1 +0,0 @@
pub mod heavier_once_cell;

View File

@@ -1,306 +0,0 @@
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::Semaphore;
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
/// for the duration of initialization.
///
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
///
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
inner: Mutex<Inner<T>>,
}
impl<T> Default for OnceCell<T> {
/// Create new uninitialized [`OnceCell`].
fn default() -> Self {
Self {
inner: Default::default(),
}
}
}
/// Semaphore is the current state:
/// - open semaphore means the value is `None`, not yet initialized
/// - closed semaphore means the value has been initialized
#[derive(Debug)]
struct Inner<T> {
init_semaphore: Arc<Semaphore>,
value: Option<T>,
}
impl<T> Default for Inner<T> {
fn default() -> Self {
Self {
init_semaphore: Arc::new(Semaphore::new(1)),
value: None,
}
}
}
impl<T> OnceCell<T> {
/// Creates an already initialized `OnceCell` with the given value.
pub fn new(value: T) -> Self {
let sem = Semaphore::new(1);
sem.close();
Self {
inner: Mutex::new(Inner {
init_semaphore: Arc::new(sem),
value: Some(value),
}),
}
}
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
/// returning the guard.
///
/// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
///
/// Initialization is panic-safe and cancellation-safe.
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
where
F: FnOnce() -> Fut,
Fut: std::future::Future<Output = Result<T, E>>,
{
let sem = {
let guard = self.inner.lock().unwrap();
if guard.value.is_some() {
return Ok(Guard(guard));
}
guard.init_semaphore.clone()
};
let permit = sem.acquire_owned().await;
if permit.is_err() {
let guard = self.inner.lock().unwrap();
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
} else {
// now we try
let value = factory().await?;
let mut guard = self.inner.lock().unwrap();
assert!(
guard.value.is_none(),
"we won permit, must not be initialized"
);
guard.value = Some(value);
guard.init_semaphore.close();
Ok(Guard(guard))
}
}
/// Returns a guard to an existing initialized value, if any.
pub fn get(&self) -> Option<Guard<'_, T>> {
let guard = self.inner.lock().unwrap();
if guard.value.is_some() {
Some(Guard(guard))
} else {
None
}
}
}
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
/// initialized value.
#[derive(Debug)]
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
impl<T> std::ops::Deref for Guard<'_, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
self.0
.value
.as_ref()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> std::ops::DerefMut for Guard<'_, T> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.0
.value
.as_mut()
.expect("guard is not created unless value has been initialized")
}
}
impl<'a, T> Guard<'a, T> {
/// Take the current value, and a new permit for it's deinitialization.
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
let mut swapped = Inner::default();
let permit = swapped
.init_semaphore
.clone()
.try_acquire_owned()
.expect("we just created this");
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, permit))
.expect("guard is not created unless value has been initialized")
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::{
convert::Infallible,
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};
#[tokio::test]
async fn many_initializers() {
#[derive(Default, Debug)]
struct Counters {
factory_got_to_run: AtomicUsize,
future_polled: AtomicUsize,
winners: AtomicUsize,
}
let initializers = 100;
let cell = Arc::new(OnceCell::default());
let counters = Arc::new(Counters::default());
let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
let mut js = tokio::task::JoinSet::new();
for i in 0..initializers {
js.spawn({
let cell = cell.clone();
let counters = counters.clone();
let barrier = barrier.clone();
async move {
barrier.wait().await;
let won = {
let g = cell
.get_or_init(|| {
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
async {
counters.future_polled.fetch_add(1, Ordering::Relaxed);
Ok::<_, Infallible>(i)
}
})
.await
.unwrap();
*g == i
};
if won {
counters.winners.fetch_add(1, Ordering::Relaxed);
}
}
});
}
barrier.wait().await;
while let Some(next) = js.join_next().await {
next.expect("no panics expected");
}
let mut counters = Arc::try_unwrap(counters).unwrap();
assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
assert_eq!(*counters.future_polled.get_mut(), 1);
assert_eq!(*counters.winners.get_mut(), 1);
}
#[tokio::test(start_paused = true)]
async fn reinit_waits_for_deinit() {
// with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization permit
let sleep_for = Duration::from_secs(1);
let initial = 42;
let reinit = 1;
let cell = Arc::new(OnceCell::new(initial));
let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
let jh = tokio::spawn({
let cell = cell.clone();
let deinitialization_started = deinitialization_started.clone();
async move {
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
assert_eq!(answer, initial);
deinitialization_started.wait().await;
tokio::time::sleep(sleep_for).await;
}
});
deinitialization_started.wait().await;
let started_at = tokio::time::Instant::now();
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
.await
.unwrap();
let elapsed = started_at.elapsed();
assert!(
elapsed >= sleep_for,
"initialization should had taken at least the time time slept with permit"
);
jh.await.unwrap();
assert_eq!(*cell.get().unwrap(), reinit);
}
#[tokio::test]
async fn initialization_attemptable_until_ok() {
let cell = OnceCell::default();
for _ in 0..10 {
cell.get_or_init(|| async { Err("whatever error") })
.await
.unwrap_err();
}
let g = cell
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
.await
.unwrap();
assert_eq!(*g, "finally success");
}
#[tokio::test]
async fn initialization_is_cancellation_safe() {
let cell = OnceCell::default();
let barrier = tokio::sync::Barrier::new(2);
let initializer = cell.get_or_init(|| async {
barrier.wait().await;
futures::future::pending::<()>().await;
Ok::<_, Infallible>("never reached")
});
tokio::select! {
_ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
_ = barrier.wait() => {}
};
// now initializer is dropped
assert!(cell.get().is_none());
let g = cell
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
.await
.unwrap();
assert_eq!(*g, "now initialized");
}
}

View File

@@ -80,6 +80,8 @@ enum-map.workspace = true
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
#tokio-epoll-uring = { path = "/home/admin/tokio-epoll-uring/tokio-epoll-uring" }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "problame/hacky-openat" }
[dev-dependencies]
criterion.workspace = true

View File

@@ -8,10 +8,9 @@ use std::collections::BinaryHeap;
use std::ops::Range;
use std::{fs, path::Path, str};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection, PAGE_SZ};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::VirtualFile;
@@ -97,7 +96,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -135,10 +134,6 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
// Initialize virtual_file (file descriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;
let mut total_image_layers = 0usize;
let mut total_excess_layers = 0usize;

View File

@@ -5,7 +5,6 @@ use clap::Subcommand;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::{page_cache, virtual_file};
use pageserver::{
repository::{Key, KEY_SIZE},
tenant::{
@@ -45,10 +44,8 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,

View File

@@ -12,10 +12,8 @@ use clap::{Parser, Subcommand};
use layers::LayerCmd;
use pageserver::{
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
task_mgr::TaskKind,
tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
virtual_file,
};
use postgres_ffi::ControlFileData;
use std::path::{Path, PathBuf};
@@ -115,9 +113,6 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
}
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await
}

View File

@@ -20,11 +20,10 @@ use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
context::{DownloadBehavior, RequestContext},
http, page_cache, page_service, task_mgr,
http, page_service, task_mgr,
task_mgr::TaskKind,
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
tenant::mgr,
virtual_file,
};
use postgres_backend::AuthType;
use utils::logging::TracingErrorLayerEnablement;
@@ -124,10 +123,6 @@ fn main() -> anyhow::Result<()> {
// Initialize up failpoints support
let scenario = pageserver::failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
scenario.teardown();
@@ -581,6 +576,31 @@ fn start_pageserver(
);
}
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::BackgroundRuntimeTurnaroundMeasure,
None,
None,
"background runtime turnaround measure",
true,
async move {
let server = hyper::Server::try_bind(&"0.0.0.0:2342".parse().unwrap()).expect("bind");
let server = server
.serve(hyper::service::make_service_fn(|_| async move {
Ok::<_, std::convert::Infallible>(hyper::service::service_fn(
move |_: hyper::Request<hyper::Body>| async move {
Ok::<_, std::convert::Infallible>(hyper::Response::new(
hyper::Body::from("alive"),
))
},
))
}))
.with_graceful_shutdown(task_mgr::shutdown_watcher());
server.await?;
Ok(())
},
);
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.

View File

@@ -0,0 +1,39 @@
use std::cell::RefCell;
use crate::tenant::disk_btree::PAGE_SZ;
pub struct Buffer(Option<Box<[u8; PAGE_SZ]>>);
// Thread-local list of re-usable buffers.
thread_local! {
static POOL: RefCell<Vec<Box<[u8; PAGE_SZ]>>> = RefCell::new(Vec::new());
}
pub(crate) fn get() -> Buffer {
let maybe = POOL.with(|rc| rc.borrow_mut().pop());
match maybe {
Some(buf) => Buffer(Some(buf)),
None => Buffer(Some(Box::new([0; PAGE_SZ]))),
}
}
impl Drop for Buffer {
fn drop(&mut self) {
let buf = self.0.take().unwrap();
POOL.with(|rc| rc.borrow_mut().push(buf))
}
}
impl std::ops::Deref for Buffer {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.0.as_ref().unwrap().as_ref()
}
}
impl std::ops::DerefMut for Buffer {
fn deref_mut(&mut self) -> &mut Self::Target {
self.0.as_mut().unwrap().as_mut()
}
}
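
A hedged usage sketch for the pool above, written as a unit test that could live in the same module (it is not part of this diff); it relies only on `get`, `Buffer`'s `Drop` impl, and the `PAGE_SZ` import shown above.

```rust
#[cfg(test)]
mod buffer_pool_reuse {
    #[test]
    fn second_checkout_reuses_allocation() {
        // First checkout may allocate a fresh Box<[u8; PAGE_SZ]>.
        let mut buf = super::get();
        buf[0] = 0xAB; // DerefMut exposes the full PAGE_SZ-byte array
        assert_eq!(buf.len(), super::PAGE_SZ);
        let first_ptr = buf.as_ptr();
        // Drop pushes the allocation back onto this thread's POOL...
        drop(buf);
        // ...so the next get() on the same thread hands out the same allocation.
        let buf2 = super::get();
        assert_eq!(buf2.as_ptr(), first_ptr);
    }
}
```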

View File

@@ -60,11 +60,7 @@ use utils::serde_percent::Percent;
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
storage_layer::{AsLayerDesc, EvictionError, Layer},
Timeline,
},
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
};
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -112,7 +108,7 @@ pub fn launch_disk_usage_global_eviction_task(
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await;
Ok(())
},
@@ -125,7 +121,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
_storage: &GenericRemoteStorage,
storage: GenericRemoteStorage,
tenants_dir: &Path,
cancel: CancellationToken,
) {
@@ -149,8 +145,14 @@ async fn disk_usage_eviction_task(
let start = Instant::now();
async {
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
let res = disk_usage_eviction_task_iteration(
state,
task_config,
&storage,
tenants_dir,
&cancel,
)
.await;
match res {
Ok(()) => {}
@@ -181,12 +183,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -270,6 +273,7 @@ struct LayerCount {
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -326,10 +330,9 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
@@ -346,15 +349,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
batched
.entry(TimelineKey(candidate.timeline))
.or_default()
.push(candidate.layer);
}
let usage_planned = match warned {
@@ -371,101 +369,64 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new();
// ratelimit to 1k files or any higher max batch size
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
// After the loop, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
// I dislike the naming of `available_permits`, but it means the current total number of permits,
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
let batch_size = batch.len();
debug!(%timeline_id, "evicting batch for timeline");
let evict = {
let limit = limit.clone();
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
async {
let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
// semaphore closing means cancelled
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch, &cancel).await;
match results {
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
evicted_bytes += file_size;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
}
match results {
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
usage_assumed.add_available_bytes(file_size);
}
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
}
Some(Err(EvictionError::FileNotFound)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_)
| e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
return;
}
}
}
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
}
(evicted_bytes, evictions_failed)
}
}
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
.await;
js.spawn(evict);
// spawning multiple thousands of these is essentially blocking, so give the already-spawned ones a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// close the semaphore to stop any pending acquires
limit.close();
if cancel.is_cancelled() {
return Ok(IterationOutcome::Cancelled);
}
};
}
Ok(IterationOutcome::Finished(IterationOutcomeFinished {
before: usage_pre,
@@ -480,7 +441,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
#[derive(Clone)]
struct EvictionCandidate {
timeline: Arc<Timeline>,
layer: Layer,
layer: Arc<dyn PersistentLayer>,
last_activity_ts: SystemTime,
}

View File

@@ -1028,7 +1028,7 @@ async fn timeline_compact_handler(
timeline
.compact(&cancel, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1053,7 +1053,7 @@ async fn timeline_checkpoint_handler(
timeline
.compact(&cancel, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
@@ -1160,11 +1160,11 @@ async fn disk_usage_eviction_run(
let state = get_state(&r);
if state.remote_storage.as_ref().is_none() {
let Some(storage) = state.remote_storage.clone() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
}
};
let state = state.disk_usage_eviction_state.clone();
@@ -1182,6 +1182,7 @@ async fn disk_usage_eviction_run(
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
&storage,
usage,
&child_cancel,
)

View File

@@ -8,7 +8,6 @@ pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod repository;
@@ -28,6 +27,8 @@ use std::path::Path;
use crate::task_mgr::TaskKind;
use tracing::info;
pub mod buffer_pool;
/// Current storage format version
///
/// This is embedded in the header of all the layer files.

View File

@@ -1,869 +0,0 @@
//!
//! Global page cache
//!
//! The page cache uses up most of the memory in the page server. It is shared
//! by all tenants, and it is used to store different kinds of pages. Sharing
//! the cache allows memory to be dynamically allocated where it's needed the
//! most.
//!
//! The page cache consists of fixed-size buffers, 8 kB each to match the
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
//! information about what's stored in the buffer.
//!
//! # Types Of Pages
//!
//! [`PageCache`] only supports immutable pages.
//! Hence there is no need to worry about coherency.
//!
//! Two types of pages are supported:
//!
//! * **Materialized pages**, filled & used by page reconstruction
//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
//!
//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
//! It uses the page cache only for the blocks that are already fully written and immutable.
//!
//! # Filling The Page Cache
//!
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
//! * Get a [`FileId`] using [`next_file_id`].
//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
//! a read guard for the page. Just use it.
//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
//! a write guard for the page. Fill the page with the contents of the on-disk file.
//! Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
//! Then try again to [`PageCache::read_immutable_buf`].
//! Unless there's high cache pressure, the page should now be cached.
//! (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
//!
//! # Locking
//!
//! There are two levels of locking involved: There's one lock for the "mapping"
//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
//! slot, and a separate lock on each slot. To read or write the contents of a
//! slot, you must hold the lock on the slot in read or write mode,
//! respectively. To change the mapping of a slot, i.e. to evict a page or to
//! assign a buffer for a page, you must hold the mapping lock and the lock on
//! the slot at the same time.
//!
//! Whenever you need to hold both locks simultaneously, the slot lock must be
//! acquired first. This consistent ordering avoids deadlocks. To look up a page
//! in the cache, you would first look up the mapping, while holding the mapping
//! lock, and then lock the slot. You must release the mapping lock in between,
//! to obey the lock ordering and avoid deadlock.
//!
//! A slot can momentarily have invalid contents, even if it's already been
//! inserted to the mapping, but you must hold the write-lock on the slot until
//! the contents are valid. If you need to release the lock without initializing
//! the contents, you must remove the mapping first. We make that easy for the
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
//! page, the caller must explicitly call guard.mark_valid() after it has
//! initialized it. If the guard is dropped without calling mark_valid(), the
//! mapping is automatically removed and the slot is marked free.
//!
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
},
};
use anyhow::Context;
use once_cell::sync::OnceCell;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
///
/// Initialize the page cache. This must be called once at page server startup.
///
pub fn init(size: usize) {
if PAGE_CACHE.set(PageCache::new(size)).is_err() {
panic!("page cache already initialized");
}
}
///
/// Get a handle to the page cache.
///
pub fn get() -> &'static PageCache {
//
// In unit tests, page server startup doesn't happen and no one calls
// page_cache::init(). Initialize it here with a tiny cache, so that the
// page cache is usable in unit tests.
//
if cfg!(test) {
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
} else {
PAGE_CACHE.get().expect("page cache not initialized")
}
}
pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
const MAX_USAGE_COUNT: u8 = 5;
/// See module-level comment.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct FileId(u64);
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// See module-level comment.
pub fn next_file_id() -> FileId {
FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
}
///
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
///
#[derive(Debug, PartialEq, Eq, Clone)]
#[allow(clippy::enum_variant_names)]
enum CacheKey {
MaterializedPage {
hash_key: MaterializedPageHashKey,
lsn: Lsn,
},
ImmutableFilePage {
file_id: FileId,
blkno: u32,
},
}
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
}
#[derive(Clone)]
struct Version {
lsn: Lsn,
slot_idx: usize,
}
struct Slot {
inner: RwLock<SlotInner>,
usage_count: AtomicU8,
}
struct SlotInner {
key: Option<CacheKey>,
buf: &'static mut [u8; PAGE_SZ],
}
impl Slot {
/// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
fn inc_usage_count(&self) {
let _ = self
.usage_count
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
if val == MAX_USAGE_COUNT {
None
} else {
Some(val + 1)
}
});
}
/// Decrement usage count on the buffer, unless it's already zero. Returns
/// the old usage count.
fn dec_usage_count(&self) -> u8 {
let count_res =
self.usage_count
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
if val == 0 {
None
} else {
Some(val - 1)
}
});
match count_res {
Ok(usage_count) => usage_count,
Err(usage_count) => usage_count,
}
}
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
///
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
/// this HashMap can be replaced with a more concurrent version, there are
/// plenty of such crates around.
///
/// If you add support for caching different kinds of objects, each object kind
/// can have a separate mapping map, next to this field.
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
/// Index of the next candidate to evict, for the Clock replacement algorithm.
/// This is interpreted modulo the page cache size.
next_evict_slot: AtomicUsize,
size_metrics: &'static PageCacheSizeMetrics,
}
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.0.buf
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.0.buf
}
}
///
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
/// until the guard is dropped.
///
/// Counterintuitively, this is used even for a read, if the requested page is not
/// currently found in the page cache. In that case, the caller of lock_for_read()
/// is expected to fill in the page contents and call mark_valid(). Similarly
/// lock_for_write() can return an invalid buffer that the caller is expected
/// to initialize.
///
pub struct PageWriteGuard<'i> {
inner: RwLockWriteGuard<'i, SlotInner>,
// Are the page contents currently valid?
valid: bool,
}
impl std::ops::DerefMut for PageWriteGuard<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.inner.buf
}
}
impl std::ops::Deref for PageWriteGuard<'_> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
self.inner.buf
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
self.inner.buf
}
}
impl PageWriteGuard<'_> {
/// Mark that the buffer contents are now valid.
pub fn mark_valid(&mut self) {
assert!(self.inner.key.is_some());
assert!(
!self.valid,
"mark_valid called on a buffer that was already valid"
);
self.valid = true;
}
}
impl Drop for PageWriteGuard<'_> {
///
/// If the buffer was allocated for a page that was not already in the
/// cache, but the lock_for_read/write() caller dropped the buffer without
/// initializing it, remove the mapping from the page cache.
///
fn drop(&mut self) {
assert!(self.inner.key.is_some());
if !self.valid {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
}
}
}
/// lock_for_read() return value
pub enum ReadBufResult<'a> {
Found(PageReadGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
/// lock_for_write() return value
pub enum WriteBufResult<'a> {
Found(PageWriteGuard<'a>),
NotFound(PageWriteGuard<'a>),
}
impl PageCache {
//
// Section 1.1: Public interface functions for looking up and memorizing materialized page
// versions in the page cache
//
/// Look up a materialized page version.
///
/// The 'lsn' is an upper bound, this will return the latest version of
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
/// returned page.
pub fn lookup_materialized_page(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
) -> Option<(Lsn, PageReadGuard)> {
crate::metrics::PAGE_CACHE
.read_accesses_materialized_page
.inc();
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
key: *key,
},
lsn,
};
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
} = cache_key
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE
.read_hits_materialized_page_exact
.inc();
} else {
crate::metrics::PAGE_CACHE
.read_hits_materialized_page_older_lsn
.inc();
}
Some((available_lsn, guard))
} else {
panic!("unexpected key type in slot");
}
} else {
None
}
}
///
/// Store an image of the given page in the cache.
///
pub fn memorize_materialized_page(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
lsn: Lsn,
img: &[u8],
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
timeline_id,
key,
},
lsn,
};
match self.lock_for_write(&cache_key)? {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(*write_guard == img);
}
WriteBufResult::NotFound(mut write_guard) => {
write_guard.copy_from_slice(img);
write_guard.mark_valid();
}
}
Ok(())
}
// Section 1.2: Public interface functions for working with immutable file pages.
pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
/// Immediately drop all buffers belonging to given file
pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
for slot_idx in 0..self.slots.len() {
let slot = &self.slots[slot_idx];
let mut inner = slot.inner.write().unwrap();
if let Some(key) = &inner.key {
match key {
CacheKey::ImmutableFilePage { file_id, blkno: _ }
if *file_id == drop_file_id =>
{
// remove mapping for old buffer
self.remove_mapping(key);
inner.key = None;
}
_ => {}
}
}
}
}
//
// Section 2: Internal interface functions for lookup/update.
//
// To add support for a new kind of "thing" to cache, you will need
// to add public interface routines above, and code to deal with the
// "mappings" after this section. But the routines in this section should
// not require changes.
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
/// for exact key of the returned page. (For materialized pages, that means
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
/// version.)
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.read().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard(inner));
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
}
}
None
}
/// Return a locked buffer for given block.
///
/// Like try_lock_for_read(), if the search criteria is not exact and the
/// page is already found in the cache, *cache_key is updated.
///
/// If the page is not found in the cache, this allocates a new buffer for
/// it. The caller may then initialize the buffer with the contents, and
/// call mark_valid().
///
/// Example usage:
///
/// ```ignore
/// let cache = page_cache::get();
///
/// match cache.lock_for_read(&key) {
/// ReadBufResult::Found(read_guard) => {
/// // The page was found in cache. Use it
/// },
/// ReadBufResult::NotFound(write_guard) => {
/// // The page was not found in cache. Read it from disk into the
/// // buffer.
/// //read_my_page_from_disk(write_guard);
///
/// // The buffer contents are now valid. Tell the page cache.
/// write_guard.mark_valid();
/// },
/// }
/// ```
///
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
),
};
read_access.inc();
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
if is_first_iteration {
hit.inc();
}
return Ok(ReadBufResult::Found(read_guard));
}
is_first_iteration = false;
// Not found. Find a victim buffer
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
}));
}
}
/// Look up a page in the cache and lock it in write mode. If it's not
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageWriteGuard { inner, valid: true });
}
}
None
}
/// Return a write-locked buffer for given block.
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
return Ok(WriteBufResult::Found(write_guard));
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
}));
}
}
//
// Section 3: Mapping functions
//
/// Search for a page in the cache using the given search key.
///
/// Returns the slot index, if any. If the search criteria is not exact,
/// *cache_key is updated with the actual key of the found page.
///
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
/// get recycled for an unrelated page immediately after this function
/// returns. The caller is responsible for re-checking that the slot still
/// contains the page with the same key before using it.
///
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
match cache_key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
Ok(version_idx) => version_idx,
Err(0) => return None,
Err(version_idx) => version_idx - 1,
};
let version = &versions[version_idx];
*lsn = version.lsn;
Some(version.slot_idx)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
/// Search for a page in the cache using the given search key.
///
/// Like 'search_mapping, but performs an "exact" search. Used for
/// allocating a new buffer.
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
Some(versions[version_idx].slot_idx)
} else {
None
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
///
/// Remove mapping for given key.
///
fn remove_mapping(&self, old_key: &CacheKey) {
match old_key {
CacheKey::MaterializedPage {
hash_key: old_hash_key,
lsn: old_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
let versions = old_entry.get_mut();
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
versions.remove(version_idx);
self.size_metrics
.current_bytes_materialized_page
.sub_page_sz(1);
if versions.is_empty() {
old_entry.remove_entry();
}
}
} else {
panic!("could not find old key in mapping")
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_immutable.sub_page_sz(1);
}
}
}
///
/// Insert mapping for given key.
///
/// If a mapping already existed for the given key, returns the slot index
/// of the existing mapping and leaves it untouched.
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
match new_key {
CacheKey::MaterializedPage {
hash_key: new_key,
lsn: new_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
let versions = map.entry(new_key.clone()).or_default();
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
Ok(version_idx) => Some(versions[version_idx].slot_idx),
Err(version_idx) => {
versions.insert(
version_idx,
Version {
lsn: *new_lsn,
slot_idx,
},
);
self.size_metrics
.current_bytes_materialized_page
.add_page_sz(1);
None
}
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_immutable.add_page_sz(1);
None
}
}
}
}
}
//
// Section 4: Misc internal helpers
//
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
iters += 1;
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
let slot = &self.slots[slot_idx];
if slot.dec_usage_count() == 0 {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(TryLockError::Poisoned(err)) => {
anyhow::bail!("buffer lock was poisoned: {err:?}")
}
Err(TryLockError::WouldBlock) => {
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were in locked. That could happen in
// theory, if you have more threads holding buffers locked than
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
anyhow::bail!("exceeded evict iter limit");
}
continue;
}
};
if let Some(old_key) = &inner.key {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
}
return Ok((slot_idx, inner));
}
}
}
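// ----------------------------------------------------------------------
// Editor's illustrative sketch, not part of the original file: the
// clock-sweep ("second chance") policy that `find_victim` implements,
// reduced to its core. Each visit decrements a slot's usage count; a slot
// whose count has already run out becomes the victim. `sketch_pick_victim`
// is a hypothetical name used only for this example.
fn sketch_pick_victim(usage_counts: &[AtomicU8], clock_hand: &AtomicUsize) -> usize {
    loop {
        let idx = clock_hand.fetch_add(1, Ordering::Relaxed) % usage_counts.len();
        // `fetch_update` returns Err when the closure yields None, i.e. the
        // count was already zero: that slot has used up its second chances.
        let exhausted = usage_counts[idx]
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |c| c.checked_sub(1))
            .is_err();
        if exhausted {
            return idx;
        }
    }
}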
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_immutable.set_page_sz(0);
size_metrics.current_bytes_materialized_page.set_page_sz(0);
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
.map(|chunk| {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: RwLock::new(SlotInner { key: None, buf }),
usage_count: AtomicU8::new(0),
}
})
.collect();
Self {
materialized_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
size_metrics,
}
}
}
trait PageSzBytesMetric {
fn set_page_sz(&self, count: usize);
fn add_page_sz(&self, count: usize);
fn sub_page_sz(&self, count: usize);
}
#[inline(always)]
fn count_times_page_sz(count: usize) -> u64 {
u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
}
impl PageSzBytesMetric for metrics::UIntGauge {
fn set_page_sz(&self, count: usize) {
self.set(count_times_page_sz(count));
}
fn add_page_sz(&self, count: usize) {
self.add(count_times_page_sz(count));
}
fn sub_page_sz(&self, count: usize) {
self.sub(count_times_page_sz(count));
}
}

View File

@@ -292,6 +292,8 @@ pub enum TaskKind {
DebugTool,
BackgroundRuntimeTurnaroundMeasure,
#[cfg(test)]
UnitTest,
}

View File

@@ -133,7 +133,9 @@ pub(crate) mod timeline;
pub mod size;
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
pub use timeline::{
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
};
// re-export for use in remote_timeline_client.rs
pub use crate::tenant::metadata::save_metadata;
@@ -4039,7 +4041,6 @@ mod tests {
#[tokio::test]
async fn delta_layer_dumping() -> anyhow::Result<()> {
use storage_layer::AsLayerDesc;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -4047,18 +4048,16 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map
.layer_map()
.get_level0_deltas()?
.into_iter()
.map(|desc| layer_map.get_from_desc(&desc))
.collect::<Vec<_>>();
let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
assert!(!level0_deltas.is_empty());
for delta in level0_deltas {
let delta = layer_map.get_from_desc(&delta);
// Ensure we are dumping a delta layer here
assert!(delta.layer_desc().is_delta);
let delta = delta.downcast_delta_layer().unwrap();
delta.dump(false, &ctx).await.unwrap();
delta.dump(true, &ctx).await.unwrap();
}

View File

@@ -11,11 +11,12 @@
//! len < 128: 0XXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
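// ----------------------------------------------------------------------
// Editor's illustrative sketch, not part of the original diff: the 1- or
// 4-byte blob length header described in the module doc above. Lengths below
// 128 fit in a single byte with the high bit clear; longer blobs use a
// 4-byte big-endian length with the high bit of the first byte set as a
// marker. The `sketch_*` names are hypothetical and exist only for this
// example.
fn sketch_encode_len(len: usize) -> Vec<u8> {
    if len < 0x80 {
        vec![len as u8]
    } else {
        assert!(len < 0x8000_0000, "length must fit in 31 bits");
        let mut hdr = (len as u32).to_be_bytes().to_vec();
        hdr[0] |= 0x80; // mark "4-byte length follows"
        hdr
    }
}
fn sketch_decode_len(hdr: &[u8]) -> (usize, usize) {
    if hdr[0] & 0x80 == 0 {
        (hdr[0] as usize, 1) // (blob length, header bytes consumed)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&hdr[0..4]);
        len_buf[0] &= 0x7f; // clear the marker bit
        (u32::from_be_bytes(len_buf) as usize, 4)
    }
}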
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use std::cmp::min;
use std::io::{Error, ErrorKind};
use super::disk_btree::PAGE_SZ;
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
@@ -33,7 +34,7 @@ impl<'a> BlockCursor<'a> {
let mut blknum = (offset / PAGE_SZ as u64) as u32;
let mut off = (offset % PAGE_SZ as u64) as usize;
let mut buf = self.read_blk(blknum)?;
let mut buf = self.read_blk(blknum).await?;
// peek at the first byte, to determine if it's a 1- or 4-byte length
let first_len_byte = buf[off];
@@ -49,7 +50,7 @@ impl<'a> BlockCursor<'a> {
// it is split across two pages
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
blknum += 1;
buf = self.read_blk(blknum)?;
buf = self.read_blk(blknum).await?;
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
off = 4 - thislen;
} else {
@@ -70,7 +71,7 @@ impl<'a> BlockCursor<'a> {
if page_remain == 0 {
// continue on next page
blknum += 1;
buf = self.read_blk(blknum)?;
buf = self.read_blk(blknum).await?;
off = 0;
page_remain = PAGE_SZ;
}

View File

@@ -4,7 +4,7 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::tenant::disk_btree::PAGE_SZ;
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::fs::File;
@@ -36,22 +36,22 @@ where
/// Reference to an in-memory copy of an immutable on-disk block.
pub enum BlockLease<'a> {
PageReadGuard(PageReadGuard<'static>),
PageReadGuard(crate::buffer_pool::Buffer),
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
}
impl From<PageReadGuard<'static>> for BlockLease<'static> {
fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
impl From<crate::buffer_pool::Buffer> for BlockLease<'static> {
fn from(value: crate::buffer_pool::Buffer) -> BlockLease<'static> {
BlockLease::PageReadGuard(value)
}
}
#[cfg(test)]
impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Rc(value)
impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Arc(value)
}
}
@@ -63,7 +63,7 @@ impl<'a> Deref for BlockLease<'a> {
BlockLease::PageReadGuard(v) => v.deref(),
BlockLease::EphemeralFileMutableTail(v) => v,
#[cfg(test)]
BlockLease::Rc(v) => v.deref(),
BlockLease::Arc(v) => v.deref(),
}
}
}
@@ -74,7 +74,7 @@ impl<'a> Deref for BlockLease<'a> {
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
// FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
@@ -83,13 +83,13 @@ pub(crate) enum BlockReaderRef<'a> {
impl<'a> BlockReaderRef<'a> {
#[inline(always)]
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReaderVirtual(r) => r.read_blk(blknum),
FileBlockReaderFile(r) => r.read_blk(blknum),
EphemeralFile(r) => r.read_blk(blknum),
Adapter(r) => r.read_blk(blknum),
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
// FileBlockReaderFile(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
}
@@ -134,8 +134,8 @@ impl<'a> BlockCursor<'a> {
/// access the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
#[inline(always)]
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum)
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum).await
}
}
@@ -145,61 +145,51 @@ impl<'a> BlockCursor<'a> {
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
}
impl<F> FileBlockReader<F>
where
F: FileExt,
{
impl<F> FileBlockReader<F> {
pub fn new(file: F) -> Self {
let file_id = page_cache::next_file_id();
FileBlockReader { file_id, file }
FileBlockReader { file }
}
}
/// Read a page from the underlying file into given buffer.
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
}
/// Read a block.
///
/// Returns a "lease" object that can be used to
/// access the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.file_id, blknum)
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
macro_rules! impls {
(FileBlockReader<$ty:ty>) => {
impl FileBlockReader<$ty> {
/// Read a page from the underlying file into given buffer.
async fn fill_buffer(
&self,
buf: crate::buffer_pool::Buffer,
blkno: u32,
) -> Result<crate::buffer_pool::Buffer, std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at_async(buf, blkno as u64 * PAGE_SZ as u64)
.await
}
/// Read a block.
///
/// Returns a "lease" object that can be used to
/// access the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
let buf = crate::buffer_pool::get();
// Read the page from disk into the buffer
let mut write_guard = self.fill_buffer(buf, blknum).await?;
Ok(BlockLease::PageReadGuard(write_guard))
}
}
}
};
}
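// ----------------------------------------------------------------------
// Editor's illustrative sketch, not part of the original diff: how a caller
// might use the buffer-pool-backed read path generated by the macro above.
// A buffer is taken from `crate::buffer_pool`, moved into the owned-buffer
// read (`read_exact_at_async`), handed back filled, and returned as a
// `BlockLease`. `sketch_read_first_block` is a hypothetical name used only
// for this example.
async fn sketch_read_first_block(file: VirtualFile) -> Result<(), std::io::Error> {
    let reader = FileBlockReader::new(file);
    // read_blk(0) reads the first PAGE_SZ-sized block of the file.
    let lease = reader.read_blk(0).await?;
    let bytes: &[u8] = lease.as_ref();
    assert_eq!(bytes.len(), PAGE_SZ);
    Ok(())
}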
impl BlockReader for FileBlockReader<File> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
}
}
// impls!(FileBlockReader<File>);
impls!(FileBlockReader<VirtualFile>);
// impl BlockReader for FileBlockReader<File> {
// fn block_cursor(&self) -> BlockCursor<'_> {
// BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
// }
// }
impl BlockReader for FileBlockReader<VirtualFile> {
fn block_cursor(&self) -> BlockCursor<'_> {

View File

@@ -262,7 +262,7 @@ where
let block_cursor = self.reader.block_cursor();
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
@@ -357,7 +357,7 @@ where
let block_cursor = self.reader.block_cursor();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = block_cursor.read_blk(self.start_blk + blknum)?;
let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
@@ -704,7 +704,7 @@ pub(crate) mod tests {
pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
let mut buf = [0u8; PAGE_SZ];
buf.copy_from_slice(&self.blocks[blknum as usize]);
Ok(std::rc::Rc::new(buf).into())
Ok(std::sync::Arc::new(buf).into())
}
}
impl BlockReader for TestDisk {

View File

@@ -2,22 +2,19 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::tenant::disk_btree::PAGE_SZ;
use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::os::unix::prelude::FileExt;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::{TenantId, TimelineId};
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
@@ -48,7 +45,6 @@ impl EphemeralFile {
)?;
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
@@ -61,40 +57,17 @@ impl EphemeralFile {
self.len
}
pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum)
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
let mut write_guard: crate::buffer_pool::Buffer = crate::buffer_pool::get();
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
let mut buf = self
.file
.read_exact_at_async(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
Ok(BlockLease::PageReadGuard(buf))
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -132,29 +105,6 @@ impl EphemeralFile {
self.blknum as u64 * PAGE_SZ as u64,
) {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
) {
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
@@ -221,9 +171,8 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
impl Drop for EphemeralFile {
fn drop(&mut self) {
// drop all pages from page cache
let cache = page_cache::get();
cache.drop_buffers_for_immutable(self.page_cache_file_id);
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there; [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
let res = std::fs::remove_file(&self.file.path);

View File

@@ -639,10 +639,147 @@ impl LayerMap {
}
println!("historic_layers:");
for desc in self.iter_historic_layers() {
desc.dump();
for layer in self.iter_historic_layers() {
layer.dump(verbose, ctx)?;
}
println!("End dump LayerMap");
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::LayerMap;
use crate::tenant::storage_layer::LayerFileName;
use std::str::FromStr;
use std::sync::Arc;
mod l0_delta_layers_updated {
use crate::tenant::{
storage_layer::{AsLayerDesc, PersistentLayerDesc},
timeline::layer_manager::LayerFileManager,
};
use super::*;
struct LayerObject(PersistentLayerDesc);
impl AsLayerDesc for LayerObject {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.0
}
}
impl LayerObject {
fn new(desc: PersistentLayerDesc) -> Self {
LayerObject(desc)
}
}
type TestLayerFileManager = LayerFileManager<LayerObject>;
#[test]
fn for_full_range_delta() {
// l0_delta_layers are used by compaction, and should observe all buffered updates
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
true
)
}
#[test]
fn for_non_full_range_delta() {
// has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
// because not full range
false
)
}
#[test]
fn for_image() {
l0_delta_layers_updated_scenario(
"000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
// the code only checks whether it is a full-range layer and doesn't care about images,
// which must mean that in practice we should never have full-range images
false
)
}
#[test]
fn replacing_missing_l0_is_notfound() {
// the original impl had an oversight: a missing L0 was reported as an anyhow::Error, but
// anyhow::Error should only happen for precondition failures.
let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
let layer = LayerFileName::from_str(layer).unwrap();
let layer = PersistentLayerDesc::from(layer);
// same skeleton construction; see scenario below
let not_found = Arc::new(LayerObject::new(layer.clone()));
let new_version = Arc::new(LayerObject::new(layer));
// after the immutable storage state refactor, the replace operation
// will not use layer map any more. We keep it here for consistency in test cases
// and can remove it in the future.
let _map = LayerMap::default();
let mut mapping = TestLayerFileManager::new();
mapping
.replace_and_verify(not_found, new_version)
.unwrap_err();
}
fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = PersistentLayerDesc::from(name);
let remote = Arc::new(LayerObject::new(skeleton.clone()));
let downloaded = Arc::new(LayerObject::new(skeleton));
let mut map = LayerMap::default();
let mut mapping = LayerFileManager::new();
// Two disjoint Arcs in different lifecycle phases. Even if it seems they must be the
// same layer, we use LayerMap::compare_arced_layers as the identity of layers.
assert_eq!(remote.layer_desc(), downloaded.layer_desc());
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update()
.insert_historic(remote.layer_desc().clone());
mapping.insert(remote.clone());
assert_eq!(
count_layer_in(&map, remote.layer_desc()),
expected_in_counts
);
mapping
.replace_and_verify(remote, downloaded.clone())
.expect("name derived attributes are the same");
assert_eq!(
count_layer_in(&map, downloaded.layer_desc()),
expected_in_counts
);
map.batch_update().remove_historic(downloaded.layer_desc());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}
fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| x.key() == layer.key())
.count();
let l0s = map
.get_level0_deltas()
.expect("why does this return a result");
let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
(historic, l0)
}
}
}

View File

@@ -26,7 +26,7 @@
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
use std::io::{self, Read, Write};
use std::io::{self, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
@@ -151,11 +151,12 @@ impl Manifest {
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the user is responsible for reconstructing a new manifest
/// and backing up the current one.
pub fn load(
mut file: VirtualFile,
pub async fn load(
file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
file.read_exact_at(&mut buf, 0)
.map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
@@ -241,8 +242,8 @@ mod tests {
use super::*;
#[test]
fn test_read_manifest() {
#[tokio::test]
async fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
@@ -274,7 +275,7 @@ mod tests {
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
@@ -306,7 +307,7 @@ mod tests {
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));

View File

@@ -163,6 +163,8 @@
//! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
//! for layers that are referenced by `IndexPart` but not present locally
//! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
@@ -231,8 +233,7 @@ use crate::metrics::{
};
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
use crate::{
config::PageServerConf,
@@ -249,7 +250,7 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::storage_layer::{LayerFileName, ResidentLayer};
use super::storage_layer::LayerFileName;
use super::upload_queue::SetDeletedFlagProgress;
// Occasional network issues and such can cause remote operations to fail, and
@@ -597,25 +598,25 @@ impl RemoteTimelineClient {
///
/// Launch an upload operation in the background.
///
pub(crate) fn schedule_layer_file_upload(
pub fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let metadata = LayerFileMetadata::new(layer.layer_desc().file_size);
upload_queue
.latest_files
.insert(layer.layer_desc().filename(), metadata.clone());
.insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!("scheduled layer file upload {layer}");
let op = UploadOp::UploadLayer(layer, metadata);
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file upload {layer_file_name}");
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
@@ -1053,8 +1054,11 @@ impl RemoteTimelineClient {
}
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let path = layer.local_path();
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
let path = &self
.conf
.timeline_path(&self.tenant_id, &self.timeline_id)
.join(layer_file_name.file_name());
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
@@ -1363,7 +1367,6 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::Layer,
Tenant, Timeline,
},
DEFAULT_PG_VERSION,
@@ -1504,7 +1507,7 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
tenant_ctx: _tenant_ctx,
remote_fs_dir,
client,
@@ -1524,29 +1527,32 @@ mod tests {
.unwrap();
// Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
let content_1 = dummy_contents("foo");
let content_2 = dummy_contents("bar");
let content_3 = dummy_contents("baz");
let layers = [
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
]
.into_iter()
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
Layer::for_resident(
harness.conf,
&timeline,
name,
LayerFileMetadata::new(contents.len() as u64),
)
}).collect::<Vec<_>>();
for (filename, content) in [
(&layer_file_name_1, &content_1),
(&layer_file_name_2, &content_2),
(&layer_file_name_3, &content_3),
] {
std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
}
client
.schedule_layer_file_upload(layers[0].clone())
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)
.unwrap();
client
.schedule_layer_file_upload(layers[1].clone())
.schedule_layer_file_upload(
&layer_file_name_2,
&LayerFileMetadata::new(content_2.len() as u64),
)
.unwrap();
// Check that they are started immediately, not queued
@@ -1599,18 +1605,21 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
);
assert_eq!(index_part.metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued
client
.schedule_layer_file_upload(layers[2].clone())
.schedule_layer_file_upload(
&layer_file_name_3,
&LayerFileMetadata::new(content_3.len() as u64),
)
.unwrap();
client
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -1625,8 +1634,8 @@ mod tests {
}
assert_remote_files(
&[
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -1637,8 +1646,8 @@ mod tests {
assert_remote_files(
&[
&layers[1].layer_desc().filename().file_name(),
&layers[2].layer_desc().filename().file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -1652,7 +1661,7 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
client,
..
} = TestSetup::new("metrics").await.unwrap();
@@ -1672,13 +1681,6 @@ mod tests {
)
.unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64),
);
#[derive(Debug, PartialEq)]
struct BytesStartedFinished {
started: Option<usize>,
@@ -1704,7 +1706,10 @@ mod tests {
let init = get_bytes_started_stopped();
client
.schedule_layer_file_upload(layer_file_1.clone())
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)
.unwrap();
let pre = get_bytes_started_stopped();

View File

@@ -67,8 +67,6 @@ pub(super) async fn upload_timeline_layer<'a>(
// upload. However, a nonexistent file can also be indicative of
// something worse, like when a file is scheduled for upload before
// it has even been written to disk.
//
// This is tested against `test_compaction_delete_before_upload`
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}

View File

@@ -4,21 +4,26 @@ pub mod delta_layer;
mod filename;
mod image_layer;
mod inmemory_layer;
mod layer;
mod layer_desc;
mod remote_layer;
use crate::config::PageServerConf;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Key;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::ops::Range;
use std::sync::Mutex;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,8 +39,7 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
pub use remote_layer::RemoteLayer;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
@@ -70,7 +74,7 @@ pub struct ValueReconstructState {
pub img: Option<(Lsn, Bytes)>,
}
/// Return value from [`Layer::get_value_reconstruct_data`]
/// Return value from Layer::get_page_reconstruct_data
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
@@ -175,6 +179,26 @@ impl LayerAccessStats {
new
}
/// Creates a clone of `self` and records `new_status` in the clone.
///
/// The `new_status` is not recorded in `self`.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
///
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn clone_for_residence_change(
&self,
new_status: LayerResidenceStatus,
) -> LayerAccessStats {
let clone = {
let inner = self.0.lock().unwrap();
inner.clone()
};
let new = LayerAccessStats(Mutex::new(clone));
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
new
}
/// Record a change in layer residency.
///
/// Recording the event must happen while holding the layer map lock to
@@ -297,12 +321,95 @@ impl LayerAccessStats {
}
}
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
/// required by [`LayerMap`](super::layer_map::LayerMap).
///
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known from the context in which the layers
/// are used (the timeline).
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
///
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from previous layer and
/// perform WAL redo, if necessary.
///
/// See ValueReconstructResult for possible return values. The collected data
/// is appended to reconstruct_data; the caller should pass an empty struct
/// on first call, or a struct with a cached older image of the page if one
/// is available. If this returns ValueReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructResult>;
}
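// ----------------------------------------------------------------------
// Editor's illustrative sketch, not part of the original diff: how a caller
// is expected to drive `get_value_reconstruct_data`, per the doc comment on
// the trait above. The variant names Complete/Continue/Missing are assumed
// from `ValueReconstructResult`; `sketch_reconstruct_value` and its
// newest-to-oldest layer slice are hypothetical. The real read path also
// narrows the LSN range as it descends; that is omitted here.
async fn sketch_reconstruct_value(
    layers_newest_first: &[&dyn Layer],
    key: Key,
    lsn_range: Range<Lsn>,
    reconstruct_data: &mut ValueReconstructState,
    ctx: &RequestContext,
) -> Result<()> {
    for layer in layers_newest_first {
        match layer
            .get_value_reconstruct_data(key, lsn_range.clone(), reconstruct_data, ctx)
            .await?
        {
            // All data collected; the caller can now run WAL redo on `reconstruct_data`.
            ValueReconstructResult::Complete => return Ok(()),
            // Not enough data yet: ask the predecessor layer, reusing the same struct.
            ValueReconstructResult::Continue => continue,
            ValueReconstructResult::Missing => anyhow::bail!("key {key} not found"),
        }
    }
    anyhow::bail!("ran out of layers before reconstructing key {key}")
}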
/// Get a layer descriptor from a layer.
pub trait AsLayerDesc {
/// Get the layer descriptor.
fn layer_desc(&self) -> &PersistentLayerDesc;
}
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions. On-disk layers are stored as files on disk, and are
/// immutable. This trait presents the common functionality of in-memory and
/// on-disk layers.
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
pub trait PersistentLayer: Layer + AsLayerDesc {
/// File name used for this layer, both in the pageserver's local filesystem
/// state as well as in the remote storage.
fn filename(&self) -> LayerFileName {
self.layer_desc().filename()
}
// Path to the layer file in the local filesystem.
// `None` for `RemoteLayer`.
fn local_path(&self) -> Option<PathBuf>;
/// Permanently remove this layer from disk.
fn delete_resident_layer_file(&self) -> Result<()>;
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
None
}
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
None
}
fn is_remote_layer(&self) -> bool {
false
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
fn access_stats(&self) -> &LayerAccessStats;
}
pub fn downcast_remote_layer(
layer: &Arc<dyn PersistentLayer>,
) -> Option<std::sync::Arc<RemoteLayer>> {
if layer.is_remote_layer() {
Arc::clone(layer).downcast_remote_layer()
} else {
None
}
}
pub mod tests {
use super::*;
@@ -340,6 +447,19 @@ pub mod tests {
}
}
/// Helper enum to hold a PageServerConf, or a path
///
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
/// global config, and paths to layer files are constructed using the tenant/timeline
/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
/// struct for a file on disk, without having a page server running, and hence no
/// config. In that case, we use the Path variant to hold the full path to the file on
/// disk.
enum PathOrConf {
Path(PathBuf),
Conf(&'static PageServerConf),
}
/// Range wrapping newtype, which uses display to render Debug.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.

View File

@@ -29,23 +29,24 @@
//!
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::disk_btree::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::tenant::storage_layer::{
PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::fs::{self, File};
use std::io::SeekFrom;
use std::io::{BufWriter, Write};
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
@@ -59,7 +60,10 @@ use utils::{
lsn::Lsn,
};
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
use super::{
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
PersistentLayerDesc,
};
///
/// Header stored in the beginning of the file
@@ -179,12 +183,20 @@ impl DeltaKey {
}
}
/// This is used only from `pagectl`. Within pageserver, all layers are
/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
/// DeltaLayer is the in-memory data structure associated with an on-disk delta
/// file.
///
/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
pub struct DeltaLayer {
path: PathBuf,
path_or_conf: PathOrConf,
pub desc: PersistentLayerDesc,
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>,
}
@@ -201,8 +213,6 @@ impl std::fmt::Debug for DeltaLayer {
}
}
/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
/// file.
pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
@@ -212,6 +222,12 @@ pub struct DeltaLayerInner {
file: FileBlockReader<VirtualFile>,
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
fn as_ref(&self) -> &DeltaLayerInner {
self
}
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
@@ -221,6 +237,19 @@ impl std::fmt::Debug for DeltaLayerInner {
}
}
#[async_trait::async_trait]
impl Layer for DeltaLayer {
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.await
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for DeltaLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -234,9 +263,40 @@ impl AsLayerDesc for DeltaLayer {
}
}
impl PersistentLayer for DeltaLayer {
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
Some(self)
}
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
fn delete_resident_layer_file(&self) -> Result<()> {
self.delete_resident_layer_file()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.info(reset)
}
fn access_stats(&self) -> &LayerAccessStats {
self.access_stats()
}
}
impl DeltaLayer {
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
self.desc.dump();
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end,
self.desc.file_size,
);
if !verbose {
return Ok(());
@@ -244,7 +304,119 @@ impl DeltaLayer {
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
inner.dump().await
println!(
"index_start_blk: {}, root {}",
inner.index_start_blk, inner.index_root_blk
);
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
tree_reader.dump().await?;
let keys = DeltaLayerInner::load_keys(&inner).await?;
// A subroutine to dump a single blob
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
Ok(desc)
}
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
let err: anyhow::Error = err;
format!("ERROR: {err}")
}
};
println!(" key {key} at {lsn}: {desc}");
}
Ok(())
}
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.desc.lsn_range.start);
ensure!(self.desc.key_range.contains(&key));
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
.await
}
pub(crate) fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_range = self.layer_desc().lsn_range.clone();
let access_stats = self.access_stats.as_api_model(reset);
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
remote: false,
access_stats,
}
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
&self.access_stats
}
fn path_for(
path_or_conf: &PathOrConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
fname: &DeltaFileName,
) -> PathBuf {
match path_or_conf {
PathOrConf::Path(path) => path.clone(),
PathOrConf::Conf(conf) => conf
.timeline_path(tenant_id, timeline_id)
.join(fname.to_string()),
}
}
fn temp_path_for(
@@ -290,22 +462,52 @@ impl DeltaLayer {
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let loaded = DeltaLayerInner::load(&path, None)?;
let summary = match &self.path_or_conf {
PathOrConf::Conf(_) => Some(Summary::from(self)),
PathOrConf::Path(_) => None,
};
// not production code
let loaded = DeltaLayerInner::load(&path, summary).await?;
let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
Ok(Arc::new(loaded))
}
/// Create a DeltaLayer struct representing an existing file on disk.
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
filename: &DeltaFileName,
file_size: u64,
access_stats: LayerAccessStats,
) -> DeltaLayer {
DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_delta(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn_range.clone(),
file_size,
),
access_stats,
inner: OnceCell::new(),
}
}
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -320,7 +522,7 @@ impl DeltaLayer {
.context("get file metadata to determine size")?;
Ok(DeltaLayer {
path: path.to_path_buf(),
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_delta(
summary.tenant_id,
summary.timeline_id,
@@ -333,9 +535,29 @@ impl DeltaLayer {
})
}
/// Path to the layer file
fn path(&self) -> PathBuf {
self.path.clone()
fn layer_name(&self) -> DeltaFileName {
self.desc.delta_file_name()
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
&self.desc.tenant_id,
&self.desc.timeline_id,
&self.layer_name(),
)
}
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
///
/// The value can be obtained via the [`ValueRef::load`] function.
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
DeltaLayerInner::load_keys(inner)
.await
.context("Layer index is corrupted")
}
}
@@ -440,7 +662,7 @@ impl DeltaLayerWriterInner {
///
/// Finish writing the delta layer.
///
fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -486,21 +708,37 @@ impl DeltaLayerWriterInner {
// Note: Because we opened the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
let desc = PersistentLayerDesc::new_delta(
self.tenant_id,
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
metadata.len(),
);
let layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc: PersistentLayerDesc::new_delta(
self.tenant_id,
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
};
// fsync the file
file.sync_all()?;
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let final_path = DeltaLayer::path_for(
&PathOrConf::Conf(self.conf),
&self.tenant_id,
&self.timeline_id,
&DeltaFileName {
key_range: self.key_start..key_end,
lsn_range: self.lsn_range,
},
);
std::fs::rename(self.path, &final_path)?;
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
trace!("created delta layer {}", layer.local_path().display());
trace!("created delta layer {}", final_path.display());
Ok(layer)
}
@@ -583,12 +821,8 @@ impl DeltaLayerWriter {
///
/// Finish writing the delta layer.
///
pub(crate) fn finish(
mut self,
key_end: Key,
timeline: &Arc<Timeline>,
) -> anyhow::Result<ResidentLayer> {
self.inner.take().unwrap().finish(key_end, timeline)
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
@@ -607,12 +841,16 @@ impl Drop for DeltaLayerWriter {
}
impl DeltaLayerInner {
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
pub(super) async fn load(
path: &std::path::Path,
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open_async(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0)?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -620,11 +858,11 @@ impl DeltaLayerInner {
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
}
}
@@ -715,14 +953,14 @@ impl DeltaLayerInner {
}
}
pub(super) async fn load_keys(&self) -> Result<Vec<DeltaEntry<'_>>> {
let file = &self.file;
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
this: &T,
) -> Result<Vec<DeltaEntry<'_>>> {
let dl = this.as_ref();
let file = &dl.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
file,
);
let tree_reader =
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
@@ -735,7 +973,7 @@ impl DeltaLayerInner {
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(self),
Adapter(dl),
)),
};
let pos = BlobRef(value).pos();
@@ -759,61 +997,10 @@ impl DeltaLayerInner {
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of value storage,
// which corresponds to beginning of the index
last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
}
Ok(all_keys)
}
pub(super) async fn dump(&self) -> anyhow::Result<()> {
println!(
"index_start_blk: {}, root {}",
self.index_start_blk, self.index_root_blk
);
let file = &self.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
file,
);
tree_reader.dump().await?;
let keys = self.load_keys().await?;
async fn dump_blob(val: ValueRef<'_>) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
Ok(desc)
}
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
format!("ERROR: {err}")
}
};
println!(" key {key} at {lsn}: {desc}");
}
Ok(())
}
}
/// A set of data associated with a delta layer key and its value
@@ -845,13 +1032,7 @@ impl<'a> ValueRef<'a> {
pub(crate) struct Adapter<T>(T);
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum)
}
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
fn as_ref(&self) -> &DeltaLayerInner {
self
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum).await
}
}

View File

@@ -25,30 +25,28 @@
//! actual page images are stored in the "values" part.
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::disk_btree::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::Timeline;
use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use hex;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::fs::{self, File};
use std::io::SeekFrom;
use std::io::Write;
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
@@ -59,7 +57,7 @@ use utils::{
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
///
/// Header stored in the beginning of the file
@@ -117,14 +115,22 @@ impl Summary {
}
}
/// This is used only from `pagectl`. Within pageserver, all layers are
/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file.
///
/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
pub struct ImageLayer {
path: PathBuf,
path_or_conf: PathOrConf,
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
}
@@ -141,8 +147,6 @@ impl std::fmt::Debug for ImageLayer {
}
}
/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file.
pub struct ImageLayerInner {
// values copied from summary
index_start_blk: u32,
@@ -163,22 +167,18 @@ impl std::fmt::Debug for ImageLayerInner {
}
}
impl ImageLayerInner {
pub(super) async fn dump(&self) -> anyhow::Result<()> {
let file = &self.file;
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
tree_reader.dump().await?;
tree_reader
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
Ok(())
#[async_trait::async_trait]
impl Layer for ImageLayer {
/// Look up given page in the file
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.await
}
}
@@ -195,21 +195,120 @@ impl AsLayerDesc for ImageLayer {
}
}
impl PersistentLayer for ImageLayer {
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
fn delete_resident_layer_file(&self) -> Result<()> {
self.delete_resident_layer_file()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.info(reset)
}
fn access_stats(&self) -> &LayerAccessStats {
self.access_stats()
}
}
impl ImageLayer {
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
self.desc.dump();
println!(
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn,
self.desc.is_incremental(),
self.desc.file_size
);
if !verbose {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let file = &inner.file;
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
inner.dump().await?;
tree_reader.dump().await?;
tree_reader
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
Ok(())
}
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.desc.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, reconstruct_state)
.await
// FIXME: makes no sense to dump paths
.with_context(|| format!("read {}", self.path().display()))
}
pub(crate) fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_start = self.layer_desc().image_layer_lsn();
HistoricLayerInfo::Image {
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start,
remote: false,
access_stats: self.access_stats.as_api_model(reset),
}
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
&self.access_stats
}
fn path_for(
path_or_conf: &PathOrConf,
timeline_id: TimelineId,
tenant_id: TenantId,
fname: &ImageFileName,
) -> PathBuf {
match path_or_conf {
PathOrConf::Path(path) => path.to_path_buf(),
PathOrConf::Conf(conf) => conf
.timeline_path(&tenant_id, &timeline_id)
.join(fname.to_string()),
}
}
fn temp_path_for(
conf: &PageServerConf,
timeline_id: TimelineId,
@@ -245,21 +344,53 @@ impl ImageLayer {
async fn load_inner(&self) -> Result<ImageLayerInner> {
let path = self.path();
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None)?;
let expected_summary = match &self.path_or_conf {
PathOrConf::Conf(_) => Some(Summary::from(self)),
PathOrConf::Path(_) => None,
};
// not production code
let actual_filename = self.path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
Ok(loaded)
}
/// Create an ImageLayer struct representing an existing file on disk
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
filename: &ImageFileName,
file_size: u64,
access_stats: LayerAccessStats,
) -> ImageLayer {
ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_img(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn,
file_size,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
access_stats,
inner: OnceCell::new(),
}
}
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -272,7 +403,7 @@ impl ImageLayer {
.metadata()
.context("get file metadata to determine size")?;
Ok(ImageLayer {
path: path.to_path_buf(),
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_img(
summary.tenant_id,
summary.timeline_id,
@@ -286,22 +417,32 @@ impl ImageLayer {
})
}
fn layer_name(&self) -> ImageFileName {
self.desc.image_file_name()
}
/// Path to the layer file in pageserver workdir.
fn path(&self) -> PathBuf {
self.path.clone()
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
self.desc.tenant_id,
&self.layer_name(),
)
}
}
impl ImageLayerInner {
pub(super) fn load(
pub(super) async fn load(
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
let file = VirtualFile::open_async(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0)?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -310,11 +451,11 @@ impl ImageLayerInner {
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
// bail!(
// "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
// actual_summary,
// expected_summary
// );
}
}
@@ -443,7 +584,7 @@ impl ImageLayerWriterInner {
///
/// Finish writing the image layer.
///
fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -485,13 +626,33 @@ impl ImageLayerWriterInner {
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc,
lsn: self.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
};
// fsync the file
file.sync_all()?;
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let final_path = ImageLayer::path_for(
&PathOrConf::Conf(self.conf),
self.timeline_id,
self.tenant_id,
&ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn,
},
);
std::fs::rename(self.path, final_path)?;
trace!("created image layer {}", layer.local_path().display());
trace!("created image layer {}", layer.path().display());
Ok(layer)
}
@@ -557,11 +718,8 @@ impl ImageLayerWriter {
///
/// Finish writing the image layer.
///
pub(crate) fn finish(
mut self,
timeline: &Arc<Timeline>,
) -> anyhow::Result<super::ResidentLayer> {
self.inner.take().unwrap().finish(timeline)
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}

View File

@@ -10,12 +10,11 @@ use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::walrecord;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::sync::OnceLock;
use tracing::*;
use utils::{
bin_ser::BeSer,
@@ -29,7 +28,7 @@ use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::RwLock;
use super::{DeltaLayerWriter, ResidentLayer};
use super::{DeltaLayer, DeltaLayerWriter, Layer};
pub struct InMemoryLayer {
conf: &'static PageServerConf,
@@ -204,6 +203,20 @@ impl InMemoryLayer {
}
}
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
}
impl std::fmt::Display for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let end_lsn = self.end_lsn_or_max();
@@ -212,13 +225,17 @@ impl std::fmt::Display for InMemoryLayer {
}
impl InMemoryLayer {
///
/// Get layer size.
///
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.len())
}
///
/// Create a new, empty, in-memory layer
///
pub fn create(
conf: &'static PageServerConf,
timeline_id: TimelineId,
@@ -296,7 +313,7 @@ impl InMemoryLayer {
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(&self, timeline: &Arc<Timeline>) -> Result<ResidentLayer> {
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -335,7 +352,7 @@ impl InMemoryLayer {
}
}
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline)?;
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
Ok(delta_layer)
}
}

File diff suppressed because it is too large.

View File

@@ -1,3 +1,4 @@
use anyhow::Result;
use core::fmt::Display;
use std::ops::Range;
use utils::{
@@ -5,7 +6,7 @@ use utils::{
lsn::Lsn,
};
use crate::repository::Key;
use crate::{context::RequestContext, repository::Key};
use super::{DeltaFileName, ImageFileName, LayerFileName};
@@ -99,22 +100,6 @@ impl PersistentLayerDesc {
}
}
pub fn from_filename(
tenant_id: TenantId,
timeline_id: TimelineId,
filename: LayerFileName,
file_size: u64,
) -> Self {
match filename {
LayerFileName::Image(i) => {
Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
}
LayerFileName::Delta(d) => {
Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
}
}
}
/// Get the LSN that the image layer covers.
pub fn image_layer_lsn(&self) -> Lsn {
assert!(!self.is_delta);
@@ -188,31 +173,21 @@ impl PersistentLayerDesc {
self.is_delta
}
pub fn dump(&self) {
if self.is_delta {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end,
self.is_incremental(),
self.file_size,
);
} else {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.image_layer_lsn(),
self.is_incremental(),
self.file_size
);
}
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end,
self.is_delta,
self.is_incremental(),
self.file_size,
);
Ok(())
}
pub fn file_size(&self) -> u64 {

View File

@@ -0,0 +1,216 @@
//! A RemoteLayer is an in-memory placeholder for a layer file that exists
//! in remote storage.
//!
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::layer_manager::LayerManager;
use anyhow::{bail, Result};
use pageserver_api::models::HistoricLayerInfo;
use std::ops::Range;
use std::path::PathBuf;
use std::sync::Arc;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use super::filename::{DeltaFileName, ImageFileName};
use super::{
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
/// [`DeltaLayer`](super::DeltaLayer).
///
/// RemoteLayer might be downloaded on demand during operations which are
/// allowed to download remote layers; once downloaded, it gets replaced with a
/// concrete `DeltaLayer` or `ImageLayer`.
///
/// See: [`crate::context::RequestContext`] for authorization to download
pub struct RemoteLayer {
pub desc: PersistentLayerDesc,
pub layer_metadata: LayerFileMetadata,
access_stats: LayerAccessStats,
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
/// Has `LayerMap::replace` failed for this (true) or not (false).
///
/// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
/// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
/// unprocessable, because a LayerMap::replace failed.
///
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
/// a possible fast loop between `Timeline::get_reconstruct_data` and
/// `Timeline::download_remote_layer`, which also logs.
///
/// [`ongoing_download`]: Self::ongoing_download
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
}
impl std::fmt::Debug for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RemoteLayer")
.field("file_name", &self.desc.filename())
.field("layer_metadata", &self.layer_metadata)
.field("is_incremental", &self.desc.is_incremental())
.finish()
}
}
#[async_trait::async_trait]
impl Layer for RemoteLayer {
async fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_state: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
bail!("layer {self} needs to be downloaded");
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl AsLayerDesc for RemoteLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
}
}
impl PersistentLayer for RemoteLayer {
fn local_path(&self) -> Option<PathBuf> {
None
}
fn delete_resident_layer_file(&self) -> Result<()> {
bail!("remote layer has no layer file");
}
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
Some(self)
}
fn is_remote_layer(&self) -> bool {
true
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_range = self.layer_desc().lsn_range.clone();
if self.desc.is_delta {
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.layer_metadata.file_size(),
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
remote: true,
access_stats: self.access_stats.as_api_model(reset),
}
} else {
HistoricLayerInfo::Image {
layer_file_name,
layer_file_size: self.layer_metadata.file_size(),
lsn_start: lsn_range.start,
remote: true,
access_stats: self.access_stats.as_api_model(reset),
}
}
}
fn access_stats(&self) -> &LayerAccessStats {
&self.access_stats
}
}
impl RemoteLayer {
pub fn new_img(
tenantid: TenantId,
timelineid: TimelineId,
fname: &ImageFileName,
layer_metadata: &LayerFileMetadata,
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_img(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn,
layer_metadata.file_size(),
),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
access_stats,
}
}
pub fn new_delta(
tenantid: TenantId,
timelineid: TimelineId,
fname: &DeltaFileName,
layer_metadata: &LayerFileMetadata,
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_delta(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn_range.clone(),
layer_metadata.file_size(),
),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
access_stats,
}
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub(crate) fn create_downloaded_layer(
&self,
_layer_map_lock_held_witness: &LayerManager,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer> {
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
Arc::new(DeltaLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
&fname,
file_size,
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
))
} else {
let fname = self.desc.image_file_name();
Arc::new(ImageLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
&fname,
file_size,
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
))
}
}
}
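The on-demand download flow mentioned in the doc comment at the top of this file roughly composes `create_downloaded_layer` above with the `replace_and_verify` API added to `LayerManager` in this branch. A hypothetical sketch follows; the real logic lives in `Timeline::download_remote_layer`, which is not shown in this diff:

// Hypothetical glue, for illustration only: assumes the layer file has already
// been downloaded to local disk and its size is known.
fn replace_after_download(
    layers: &mut LayerManager,
    remote: Arc<RemoteLayer>,
    conf: &'static PageServerConf,
    file_size: u64,
) -> anyhow::Result<()> {
    // Build the concrete ImageLayer/DeltaLayer for the now-resident file ...
    let downloaded = remote.create_downloaded_layer(layers, conf, file_size);
    // ... and swap it into the layer file manager, verifying that we replace
    // the exact RemoteLayer instance we started from.
    layers.replace_and_verify(remote, downloaded)
}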

File diff suppressed because it is too large.

View File

@@ -29,6 +29,7 @@ use crate::{
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
storage_layer::PersistentLayer,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
@@ -193,26 +194,15 @@ impl Timeline {
// NB: all the checks can be invalidated as soon as we release the layer map lock.
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<_> = {
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = guard.get_from_desc(&hist_layer);
// guard against eviction while we inspect it; it might be that eviction_task and
// disk_usage_eviction_task both select the same layers to be evicted, and
// seemingly free up double the space. both succeeding is of no consequence.
let guard = match hist_layer.keep_resident().await {
Ok(Some(l)) => l,
Ok(None) => continue,
Err(e) => {
// these should not happen, but we cannot make them statically impossible right
// now.
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
continue;
}
};
if hist_layer.is_remote_layer() {
continue;
}
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
// We only use this fallback if there's an implementation error.
@@ -243,7 +233,7 @@ impl Timeline {
}
};
if no_activity_for > p.threshold {
candidates.push(guard.drop_eviction_guard())
candidates.push(hist_layer)
}
}
candidates
@@ -262,7 +252,7 @@ impl Timeline {
};
let results = match self
.evict_layer_batch(remote_client, &candidates, cancel)
.evict_layer_batch(remote_client, &candidates[..], cancel.clone())
.await
{
Err(pre_err) => {
@@ -273,7 +263,7 @@ impl Timeline {
Ok(results) => results,
};
assert_eq!(results.len(), candidates.len());
for result in results {
for (l, result) in candidates.iter().zip(results) {
match result {
None => {
stats.skipped_for_shutdown += 1;
@@ -281,10 +271,20 @@ impl Timeline {
Some(Ok(())) => {
stats.evicted += 1;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
stats.not_evictable += 1;
}
Some(Err(EvictionError::FileNotFound)) => {
// compaction/gc removed the file while we were waiting on layer_removal_cs
stats.not_evictable += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(layer = %l, "failed to evict layer: {e}");
stats.not_evictable += 1;
}
}
}
if stats.candidates == stats.not_evictable {

View File

@@ -8,19 +8,21 @@ use utils::{
use crate::{
config::PageServerConf,
metrics::TimelineMetrics,
tenant::{
layer_map::{BatchedUpdates, LayerMap},
storage_layer::{
AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
ResidentLayer,
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
PersistentLayerDesc, PersistentLayerKey,
},
timeline::compare_arced_layers,
},
};
/// Provides semantic APIs to manipulate the layer map.
pub(crate) struct LayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager<Layer>,
layer_fmgr: LayerFileManager,
}
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
@@ -41,7 +43,7 @@ impl LayerManager {
}
}
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
self.layer_fmgr.get_from_desc(desc)
}
@@ -53,12 +55,21 @@ impl LayerManager {
&self.layer_map
}
/// Replace layers in the layer file manager, used in evictions and layer downloads.
pub(crate) fn replace_and_verify(
&mut self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
) -> Result<()> {
self.layer_fmgr.replace_and_verify(expected, new)
}
/// Called from `load_layer_map`. Initialize the layer manager with:
/// 1. all on-disk layers
/// 2. next open layer (at the disk_consistent_lsn)
pub(crate) fn initialize_local_layers(
&mut self,
on_disk_layers: Vec<Layer>,
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
next_open_layer_at: Lsn,
) {
let mut updates = self.layer_map.batch_update();
@@ -153,10 +164,10 @@ impl LayerManager {
}
/// Add image layers to the layer map, called from `create_image_layers`.
pub(crate) fn track_new_image_layers(&mut self, image_layers: &[ResidentLayer]) {
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
let mut updates = self.layer_map.batch_update();
for layer in image_layers {
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
@@ -164,47 +175,46 @@ impl LayerManager {
/// Flush a frozen layer and add the written delta layer to the layer map.
pub(crate) fn finish_flush_l0_layer(
&mut self,
delta_layer: Option<&ResidentLayer>,
delta_layer: Option<DeltaLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
) {
let inmem = self
.layer_map
.frozen_layers
.pop_front()
.expect("there must be a inmem layer to flush");
let l = self.layer_map.frozen_layers.pop_front();
let mut updates = self.layer_map.batch_update();
// Only one task may call this function at a time (for this
// timeline). If two tasks tried to flush the same frozen
// Only one thread may call this function at a time (for this
// timeline). If two threads tried to flush the same frozen
// layer to disk at the same time, that would not work.
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
if let Some(l) = delta_layer {
let mut updates = self.layer_map.batch_update();
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
updates.flush();
if let Some(delta_layer) = delta_layer {
Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
/// Called when compaction is completed.
pub(crate) fn finish_compact_l0(
&mut self,
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
compact_from: Vec<Layer>,
compact_to: &[ResidentLayer],
duplicates: &[(ResidentLayer, ResidentLayer)],
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
compact_from: Vec<Arc<dyn PersistentLayer>>,
compact_to: Vec<Arc<dyn PersistentLayer>>,
metrics: &TimelineMetrics,
) -> Result<()> {
let mut updates = self.layer_map.batch_update();
for l in compact_to {
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
for l in compact_from {
// NB: the layer file identified by descriptor `l` is guaranteed to be present
// in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
// time, even though we dropped `Timeline::layers` in between.
Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr)?;
}
for (old, new) in duplicates {
self.layer_fmgr.replace(old.as_ref(), new.as_ref().clone());
Self::delete_historic_layer(
layer_removal_cs.clone(),
l,
&mut updates,
metrics,
&mut self.layer_fmgr,
)?;
}
updates.flush();
Ok(())
@@ -213,26 +223,28 @@ impl LayerManager {
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
pub(crate) fn finish_gc_timeline(
&mut self,
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
gc_layers: Vec<Layer>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
gc_layers: Vec<Arc<dyn PersistentLayer>>,
metrics: &TimelineMetrics,
) -> Result<ApplyGcResultGuard> {
let mut updates = self.layer_map.batch_update();
for doomed_layer in gc_layers {
Self::delete_historic_layer(
layer_removal_cs,
layer_removal_cs.clone(),
doomed_layer,
&mut updates,
metrics,
&mut self.layer_fmgr,
)?;
)?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
}
Ok(ApplyGcResultGuard(updates))
}
/// Helper function to insert a layer into the layer map and file manager.
fn insert_historic_layer(
layer: Layer,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager<Layer>,
mapping: &mut LayerFileManager,
) {
updates.insert_historic(layer.layer_desc().clone());
mapping.insert(layer);
@@ -242,12 +254,17 @@ impl LayerManager {
/// Remote storage is not affected by this operation.
fn delete_historic_layer(
// we cannot remove layers otherwise, since gc and compaction will race
_layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
layer: Layer,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager<Layer>,
metrics: &TimelineMetrics,
mapping: &mut LayerFileManager,
) -> anyhow::Result<()> {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
metrics.resident_physical_size_gauge.sub(desc.file_size);
}
// TODO Removing from the bottom of the layer map is expensive.
// Maybe instead discard all layer map historic versions that
@@ -255,21 +272,22 @@ impl LayerManager {
// and mark what we can't delete yet as deleted from the layer
// map index without actually rebuilding the index.
updates.remove_historic(desc);
mapping.remove(&layer);
layer.garbage_collect_on_drop();
mapping.remove(layer);
Ok(())
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
self.layer_fmgr.contains(layer)
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
HashMap<PersistentLayerKey, Arc<T>>,
);
impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
@@ -279,14 +297,14 @@ impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
.clone()
}
pub(crate) fn insert(&mut self, layer: T) {
pub(crate) fn insert(&mut self, layer: Arc<T>) {
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
if present.is_some() && cfg!(debug_assertions) {
panic!("overwriting a layer: {:?}", layer.layer_desc())
}
}
pub(crate) fn contains(&self, layer: &T) -> bool {
pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}
@@ -294,7 +312,7 @@ impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
Self(HashMap::new())
}
pub(crate) fn remove(&mut self, layer: &T) {
pub(crate) fn remove(&mut self, layer: Arc<T>) {
let present = self.0.remove(&layer.layer_desc().key());
if present.is_none() && cfg!(debug_assertions) {
panic!(
@@ -304,13 +322,38 @@ impl<T: AsLayerDesc + Clone + PartialEq + std::fmt::Debug> LayerFileManager<T> {
}
}
pub(crate) fn replace(&mut self, old: &T, new: T) {
let key = old.layer_desc().key();
assert_eq!(key, new.layer_desc().key());
pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
let key = expected.layer_desc().key();
let other = new.layer_desc().key();
if let Some(existing) = self.0.get_mut(&key) {
assert_eq!(existing, old);
*existing = new;
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
let new_l0 = LayerMap::is_l0(new.layer_desc());
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
"layermap-replace-notfound"
));
anyhow::ensure!(
key == other,
"expected and new layer have different keys: {key:?} != {other:?}"
);
anyhow::ensure!(
expected_l0 == new_l0,
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
);
if let Some(layer) = self.0.get_mut(&key) {
anyhow::ensure!(
compare_arced_layers(&expected, layer),
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
expected = Arc::as_ptr(&expected),
new = Arc::as_ptr(layer),
);
*layer = new;
Ok(())
} else {
anyhow::bail!("layer was not found");
}
}
}
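For reference, `compare_arced_layers` (imported above from `timeline` and not otherwise shown in this diff) is assumed to be plain pointer identity on the Arc payloads, along these lines:

// Assumed shape of the helper used by replace_and_verify; the real definition
// lives in crate::tenant::timeline. It compares only the data addresses,
// because Arc::ptr_eq on trait objects also compares vtable pointers, which
// may differ across codegen units.
pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
    Arc::as_ptr(left).cast::<()>() == Arc::as_ptr(right).cast::<()>()
}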

View File

@@ -1,7 +1,6 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -211,7 +210,7 @@ pub(crate) struct Delete {
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
UploadLayer(ResidentLayer, LayerFileMetadata),
UploadLayer(LayerFileName, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
@@ -226,8 +225,13 @@ pub(crate) enum UploadOp {
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(layer, metadata) => {
write!(f, "UploadLayer({}, size={:?})", layer, metadata.file_size())
UploadOp::UploadLayer(path, metadata) => {
write!(
f,
"UploadLayer({}, size={:?})",
path.file_name(),
metadata.file_size()
)
}
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
UploadOp::Delete(delete) => write!(

View File

@@ -11,13 +11,15 @@
//! src/backend/storage/file/fd.c
//!
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
use once_cell::sync::OnceCell;
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
use std::os::fd::OwnedFd;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, Mutex};
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -39,7 +41,7 @@ pub struct VirtualFile {
/// Lazy handle to the global file descriptor cache. The slot that this points to
/// might contain our File, or it may be empty, or it may contain a File that
/// belongs to a different VirtualFile.
handle: RwLock<SlotHandle>,
handle: Arc<Mutex<Option<File>>>, // only transiently None
/// Current file position
pos: u64,
@@ -51,7 +53,6 @@ pub struct VirtualFile {
/// opened, in the VirtualFile::create() function, and strip the flag before
/// storing it here.
pub path: PathBuf,
open_options: OpenOptions,
// These are strings because we only use them for metrics, and those expect strings.
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
@@ -60,118 +61,6 @@ pub struct VirtualFile {
timeline_id: String,
}
#[derive(Debug, PartialEq, Clone, Copy)]
struct SlotHandle {
/// Index into OPEN_FILES.slots
index: usize,
/// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
/// been recycled and no longer contains the FD for this virtual file.
tag: u64,
}
/// OPEN_FILES is the global array that holds the physical file descriptors that
/// are currently open. Each slot in the array is protected by a separate lock,
/// so that different files can be accessed independently. The lock must be held
/// in write mode to replace the slot with a different file, but a read mode
/// is enough to operate on the file, whether you're reading or writing to it.
///
/// OPEN_FILES starts in uninitialized state, and it's initialized by
/// the virtual_file::init() function. It must be called exactly once at page
/// server startup.
static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
struct OpenFiles {
slots: &'static [Slot],
/// clock arm for the clock algorithm
next: AtomicUsize,
}
struct Slot {
inner: RwLock<SlotInner>,
/// has this file been used since last clock sweep?
recently_used: AtomicBool,
}
struct SlotInner {
/// Counter that's incremented every time a different file is stored here.
/// To avoid the ABA problem.
tag: u64,
/// the underlying file
file: Option<File>,
}
impl OpenFiles {
/// Find a slot to use, evicting an existing file descriptor if needed.
///
/// On return, we hold a lock on the slot, its 'tag' has been updated, and
/// recently_used has been set. It's all ready for reuse.
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
//
// Run the clock algorithm to find a slot to replace.
//
let num_slots = self.slots.len();
let mut retries = 0;
let mut slot;
let mut slot_guard;
let index;
loop {
let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
slot = &self.slots[next];
// If the recently_used flag on this slot is set, continue the clock
// sweep. Otherwise try to use this slot. If we cannot acquire the
// lock, also continue the clock sweep.
//
// We only continue in this manner for a while, though. If we loop
// through the array twice without finding a victim, just pick the
// next slot and wait until we can reuse it. This way, we avoid
// spinning in the extreme case that all the slots are busy with an
// I/O operation.
if retries < num_slots * 2 {
if !slot.recently_used.swap(false, Ordering::Release) {
if let Ok(guard) = slot.inner.try_write() {
slot_guard = guard;
index = next;
break;
}
}
retries += 1;
} else {
slot_guard = slot.inner.write().unwrap();
index = next;
break;
}
}
//
// We now have the victim slot locked. If it was in use previously, close the
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
STORAGE_IO_TIME
.with_label_values(&["close-by-replace"])
.observe_closure_duration(|| drop(old_file));
}
// Prepare the slot for reuse and return it
slot_guard.tag += 1;
slot.recently_used.store(true, Ordering::Relaxed);
(
SlotHandle {
index,
tag: slot_guard.tag,
},
slot_guard,
)
}
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
@@ -207,7 +96,6 @@ impl VirtualFile {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.observe_closure_duration(|| open_options.open(path))?;
@@ -223,15 +111,76 @@ impl VirtualFile {
reopen_options.truncate(false);
let vfile = VirtualFile {
handle: RwLock::new(handle),
handle: Arc::new(Mutex::new(Some(file))),
pos: 0,
path: path.to_path_buf(),
open_options: reopen_options,
tenant_id,
timeline_id,
};
slot_guard.file.replace(file);
Ok(vfile)
}
/// Open a file in read-only mode. Like File::open.
pub async fn open_async(path: &Path) -> Result<VirtualFile, std::io::Error> {
let mut options = tokio_epoll_uring::ops::open_at::OpenOptions::new();
options.read(true);
Self::open_with_options_async(path, options).await
}
/// Open a file with given options.
///
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
/// they will be applied also when the file is subsequently re-opened, not only
/// on the first time. Make sure that's sane!
pub async fn open_with_options_async(
path: &Path,
open_options: tokio_epoll_uring::ops::open_at::OpenOptions,
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string_lossy();
let parts = path_str.split('/').collect::<Vec<&str>>();
let tenant_id;
let timeline_id;
if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
tenant_id = parts[parts.len() - 4].to_string();
timeline_id = parts[parts.len() - 2].to_string();
} else {
tenant_id = "*".to_string();
timeline_id = "*".to_string();
}
let start = std::time::Instant::now();
let system = tokio_epoll_uring::thread_local_system().await;
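// Note: tokio_epoll_uring reports either a per-operation error (a plain
// io::Error from the kernel) or a system-level failure of the uring itself;
// the map_err below flattens both into io::Error.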
let file: OwnedFd = system
.open(path, &open_options)
.await
.map_err(|e| match e {
tokio_epoll_uring::Error::Op(e) => e,
tokio_epoll_uring::Error::System(system) => {
std::io::Error::new(std::io::ErrorKind::Other, system)
}
})?;
let file = File::from(file);
STORAGE_IO_TIME
.with_label_values(&["open"])
.observe(start.elapsed().as_secs_f64());
// Strip all options other than read and write.
//
// It would perhaps be nicer to check just for the read and write flags
// explicitly, but OpenOptions doesn't contain any functions to read flags,
// only to set them.
let mut reopen_options = open_options;
reopen_options.create(false);
reopen_options.create_new(false);
reopen_options.truncate(false);
let vfile = VirtualFile {
handle: Arc::new(Mutex::new(Some(file))),
pos: 0,
path: path.to_path_buf(),
tenant_id,
timeline_id,
};
Ok(vfile)
}
@@ -244,7 +193,9 @@ impl VirtualFile {
pub fn metadata(&self) -> Result<fs::Metadata, Error> {
self.with_file("metadata", |file| file.metadata())?
}
}
impl VirtualFile {
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.
@@ -252,68 +203,9 @@ impl VirtualFile {
where
F: FnMut(&File) -> R,
{
let open_files = get_open_files();
let mut handle_guard = {
// Read the cached slot handle, and see if the slot that it points to still
// contains our File.
//
// We only need to hold the handle lock while we read the current handle. If
// another thread closes the file and recycles the slot for a different file,
// we will notice that the handle we read is no longer valid and retry.
let mut handle = *self.handle.read().unwrap();
loop {
// Check if the slot contains our File
{
let slot = &open_files.slots[handle.index];
let slot_guard = slot.inner.read().unwrap();
if slot_guard.tag == handle.tag {
if let Some(file) = &slot_guard.file {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME
.with_label_values(&[op])
.observe_closure_duration(|| func(file)));
}
}
}
// The slot didn't contain our File. We will have to open it ourselves,
// but before that, grab a write lock on handle in the VirtualFile, so
// that no other thread will try to concurrently open the same file.
let handle_guard = self.handle.write().unwrap();
// If another thread changed the handle while we were not holding the lock,
// then the handle might now be valid again. Loop back to retry.
if *handle_guard != handle {
handle = *handle_guard;
continue;
}
break handle_guard;
}
};
// We need to open the file ourselves. The handle in the VirtualFile is
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot();
// Open the physical file
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME
return Ok(STORAGE_IO_TIME
.with_label_values(&[op])
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile
// to point to it.
slot_guard.file.replace(file);
*handle_guard = handle;
Ok(result)
.observe_closure_duration(|| func(&*self.handle.lock().unwrap().as_ref().unwrap())));
}
pub fn remove(self) {
@@ -323,35 +215,6 @@ impl VirtualFile {
}
}
impl Drop for VirtualFile {
/// If a VirtualFile is dropped, close the underlying file if it was open.
fn drop(&mut self) {
let handle = self.handle.get_mut().unwrap();
// We could check with a read-lock first, to avoid waiting on an
// unrelated I/O.
let slot = &get_open_files().slots[handle.index];
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME
.with_label_values(&["close"])
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
}
impl Read for VirtualFile {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
let pos = self.pos;
let n = self.read_at(buf, pos)?;
self.pos += n as u64;
Ok(n)
}
}
impl Write for VirtualFile {
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
let pos = self.pos;
@@ -367,8 +230,8 @@ impl Write for VirtualFile {
}
}
impl Seek for VirtualFile {
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
impl VirtualFile {
pub fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
match pos {
SeekFrom::Start(offset) => {
self.pos = offset;
@@ -392,11 +255,113 @@ impl Seek for VirtualFile {
}
Ok(self.pos)
}
}
impl FileExt for VirtualFile {
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
while !buf.is_empty() {
match self.read_at(buf, offset) {
Ok(0) => {
return Err(Error::new(
std::io::ErrorKind::UnexpectedEof,
"failed to fill whole buffer",
))
}
Ok(n) => {
buf = &mut buf[n..];
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at_async(
&self,
mut write_guard: crate::buffer_pool::Buffer,
offset: u64,
) -> Result<crate::buffer_pool::Buffer, Error> {
let file = self.handle.lock().unwrap().take().unwrap();
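// The uring op needs owned resources: temporarily take the File out of
// self.handle (leaving it transiently None) and put it back once the read
// returns; the scopeguard below panics if we forget to do so.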
let put_back = AtomicBool::new(false);
let put_back_ref = &put_back;
scopeguard::defer! {
if !put_back_ref.load(std::sync::atomic::Ordering::Relaxed) {
panic!("mut put self.handle back")
}
};
let system = tokio_epoll_uring::thread_local_system().await;
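// Adapter that lets the pooled buffer be handed to tokio-epoll-uring by value;
// the unsafe IoBuf/IoBufMut impls promise a stable pointer and track how many
// bytes the kernel has initialized.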
struct PageWriteGuardBuf {
buf: crate::buffer_pool::Buffer,
init_up_to: usize,
}
unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
fn stable_ptr(&self) -> *const u8 {
self.buf.as_ptr()
}
fn bytes_init(&self) -> usize {
self.init_up_to
}
fn bytes_total(&self) -> usize {
self.buf.len()
}
}
unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.buf.as_mut_ptr()
}
unsafe fn set_init(&mut self, pos: usize) {
assert!(pos <= self.buf.len());
self.init_up_to = pos;
}
}
let buf = PageWriteGuardBuf {
buf: write_guard,
init_up_to: 0,
};
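// Ownership of both the fd and the buffer is handed back alongside the
// result, whether the read succeeded or failed.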
let ((file, buf), res) = system.read(file.into(), offset, buf).await;
let PageWriteGuardBuf {
buf: write_guard,
init_up_to,
} = buf;
if let Ok(num_read) = res {
assert!(init_up_to <= num_read);
}
let replaced = self.handle.lock().unwrap().replace(File::from(file));
assert!(replaced.is_none());
put_back.store(true, std::sync::atomic::Ordering::Relaxed);
res.map(|_| write_guard)
.map_err(|e| Error::new(ErrorKind::Other, e))
}
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
pub fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
while !buf.is_empty() {
match self.write_at(buf, offset) {
Ok(0) => {
return Err(Error::new(
std::io::ErrorKind::WriteZero,
"failed to write whole buffer",
));
}
Ok(n) => {
buf = &buf[n..];
offset += n as u64;
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
Ok(())
}
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = self.with_file("read", |file| file.read_at(buf, offset))?;
let result = self.with_file("read", |file| {
tracing::info!("sync read\n{}", std::backtrace::Backtrace::force_capture());
file.read_at(buf, offset)
})?;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -405,7 +370,7 @@ impl FileExt for VirtualFile {
result
}
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
pub fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
let result = self.with_file("write", |file| file.write_at(buf, offset))?;
if let Ok(size) = result {
STORAGE_IO_SIZE
@@ -415,256 +380,3 @@ impl FileExt for VirtualFile {
result
}
}
impl OpenFiles {
fn new(num_slots: usize) -> OpenFiles {
let mut slots = Box::new(Vec::with_capacity(num_slots));
for _ in 0..num_slots {
let slot = Slot {
recently_used: AtomicBool::new(false),
inner: RwLock::new(SlotInner { tag: 0, file: None }),
};
slots.push(slot);
}
OpenFiles {
next: AtomicUsize::new(0),
slots: Box::leak(slots),
}
}
}
///
/// Initialize the virtual file module. This must be called once at page
/// server startup.
///
pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
}
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
// Get a handle to the global slots array.
fn get_open_files() -> &'static OpenFiles {
//
// In unit tests, page server startup doesn't happen and no one calls
// virtual_file::init(). Initialize it here, with a small array.
//
// This applies to the virtual file tests below, but all other unit
// tests too, so the virtual file facility is always usable in
// unit tests.
//
if cfg!(test) {
OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
} else {
OPEN_FILES.get().expect("virtual_file::init not called yet")
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng;
use std::sync::Arc;
use std::thread;
// Helper function to slurp contents of a file, starting at the current position,
// into a string
fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
where
FD: Read,
{
let mut buf = String::new();
vfile.read_to_string(&mut buf)?;
Ok(buf)
}
// Helper function to slurp a portion of a file into a string
fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
where
FD: FileExt,
{
let mut buf = Vec::new();
buf.resize(len, 0);
vfile.read_exact_at(&mut buf, pos)?;
Ok(String::from_utf8(buf).unwrap())
}
#[test]
fn test_virtual_files() -> Result<(), Error> {
// The real work is done in the test_files() helper function. This
// allows us to run the same set of tests against a native File, and
// VirtualFile. We trust the native Files and wouldn't need to test them,
// but this allows us to verify that the operations return the same
// results with VirtualFiles as with native Files. (Except that with
// native files, you will run out of file descriptors if the ulimit
// is low enough.)
test_files("virtual_files", |path, open_options| {
VirtualFile::open_with_options(path, open_options)
})
}
#[test]
fn test_physical_files() -> Result<(), Error> {
test_files("physical_files", |path, open_options| {
open_options.open(path)
})
}
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
where
FD: Read + Write + Seek + FileExt,
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
{
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
std::fs::create_dir_all(&testdir)?;
let path_a = testdir.join("file_a");
let mut file_a = openfunc(
&path_a,
OpenOptions::new().write(true).create(true).truncate(true),
)?;
file_a.write_all(b"foobar")?;
// cannot read from a file opened in write-only mode
assert!(read_string(&mut file_a).is_err());
// Close the file and re-open for reading
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
// cannot write to a file opened in read-only mode
assert!(file_a.write(b"bar").is_err());
// Try simple read
assert_eq!("foobar", read_string(&mut file_a)?);
// It's positioned at the EOF now.
assert_eq!("", read_string(&mut file_a)?);
// Test seeks.
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
assert_eq!("oobar", read_string(&mut file_a)?);
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
assert_eq!("ar", read_string(&mut file_a)?);
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
assert_eq!("bar", read_string(&mut file_a)?);
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
assert_eq!("oobar", read_string(&mut file_a)?);
// Test erroneous seeks to before byte 0
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
// the erroneous seek should have left the position unchanged
assert_eq!("oobar", read_string(&mut file_a)?);
// Create another test file, and try FileExt functions on it.
let path_b = testdir.join("file_b");
let mut file_b = openfunc(
&path_b,
OpenOptions::new()
.read(true)
.write(true)
.create(true)
.truncate(true),
)?;
file_b.write_all_at(b"BAR", 3)?;
file_b.write_all_at(b"FOO", 0)?;
assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
// Open a lot of files, enough to cause some evictions. (Or to be precise,
// open the same file many times. The effect is the same.)
//
// leave file_a positioned at offset 1 before we start
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
let mut vfiles = Vec::new();
for _ in 0..100 {
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
assert_eq!("FOOBAR", read_string(&mut vfile)?);
vfiles.push(vfile);
}
// make sure we opened enough files to definitely cause evictions.
assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
// The underlying file descriptor for 'file_a' should be closed now. Try to read
// from it again. We left the file positioned at offset 1 above.
assert_eq!("oobar", read_string(&mut file_a)?);
// Check that all the other FDs still work too. Use them in random order for
// good measure.
vfiles.as_mut_slice().shuffle(&mut thread_rng());
for vfile in vfiles.iter_mut() {
assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
}
Ok(())
}
/// Test using VirtualFiles from many threads concurrently. This tests both using
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
/// VirtualFile from multiple threads concurrently.
#[test]
fn test_vfile_concurrency() -> Result<(), Error> {
const SIZE: usize = 8 * 1024;
const VIRTUAL_FILES: usize = 100;
const THREADS: usize = 100;
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
std::fs::create_dir_all(&testdir)?;
// Create a test file.
let test_file_path = testdir.join("concurrency_test_file");
{
let file = File::create(&test_file_path)?;
file.write_all_at(&SAMPLE, 0)?;
}
// Open the file many times.
let mut files = Vec::new();
for _ in 0..VIRTUAL_FILES {
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
files.push(f);
}
let files = Arc::new(files);
// Launch many threads, and use the virtual files concurrently in random order.
let mut threads = Vec::new();
for threadno in 0..THREADS {
let builder =
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
let files = files.clone();
let thread = builder
.spawn(move || {
let mut buf = [0u8; SIZE];
let mut rng = rand::thread_rng();
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
f.read_exact_at(&mut buf, 0).unwrap();
assert!(buf == SAMPLE);
}
})
.unwrap();
threads.push(thread);
}
for thread in threads {
thread.join().unwrap();
}
Ok(())
}
}

View File

@@ -1524,7 +1524,7 @@ class NeonPageserver(PgProtocol):
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
".*compaction_loop.*Compaction failed, retrying in.*timeline or pageserver is shutting down", # When compaction checks timeline state after acquiring layer_removal_cs
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",

View File

@@ -0,0 +1,32 @@
import queue
import threading
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.types import TenantId
def test_pageserver_startup_many_tenants(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
# below doesn't work because summaries contain tenant and timeline ids and we check for them
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
pshttp = env.pageserver.http_client()
ep = env.endpoints.create_start("main")
ep.safe_psql("create table foo(b text)")
for i in range(0, 8):
ep.safe_psql("insert into foo(b) values ('some text')")
# pg_bin.run_capture(["pgbench", "-i", "-s1", ep.connstr()])
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
pshttp.timeline_checkpoint(tenant_id, timeline_id)
ep.stop_and_destroy()
env.pageserver.stop()
for sk in env.safekeepers:
sk.stop()
tenant_dir = env.repo_dir / "tenants" / str(env.initial_tenant)
for i in range(0, 20_000):
import shutil
shutil.copytree(tenant_dir, tenant_dir.parent / str(TenantId.generate()))

View File

@@ -22,7 +22,7 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
# eviction might be the first one after an attach to access the layers
env.pageserver.allowed_errors.append(
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction"
)
assert isinstance(env.remote_storage, LocalFsStorage)
return env

View File

@@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.extend(
[
".*layer loading failed:.*",
".*Failed to load delta layer.*",
".*could not find data for key.*",
".*is not active. Current state: Broken.*",
".*will not become active. Current state: Broken.*",
@@ -99,7 +99,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
# Third timeline will also fail during basebackup, because the layer file is corrupt.
# It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
# (We don't check layer file contents on startup, when loading the timeline)
with pytest.raises(Exception, match="layer loading failed:") as err:
with pytest.raises(Exception, match="Failed to load delta layer") as err:
pg3.start()
log.info(
f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"

View File

@@ -2,137 +2,35 @@ import time
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from requests.exceptions import ConnectionError
# Test duplicate layer detection
#
# This test sets a failpoint at the end of the first compaction phase:
# after flushing new L1 layers but before deletion of the L0 layers.
# This should cause compaction to generate a duplicate L1 layer after restart.
@pytest.mark.timeout(600)
def test_compaction_duplicates_all(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"""
Makes compact_level0_phase1 return input layers as the output layers with a
failpoint, as if those L0 inputs had all been recreated when L1s were
supposed to be created.
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_compaction_duplicates_all",
)
def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env = neon_env_builder.init_start(
initial_tenant_conf={
# Use aggressive compaction and checkpoint settings
tenant_id, _ = env.neon_cli.create_tenant(
conf={
"checkpoint_distance": f"{1024 ** 2}",
"compaction_target_size": f"{1024 ** 2}",
"compaction_period": "0 s",
"compaction_period": "5 s",
"compaction_threshold": "3",
}
)
pageserver_http = env.pageserver.http_client()
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return"))
# pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit"))
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
pageserver_http.timeline_compact(tenant_id, timeline_id)
time.sleep(10)  # let compaction be performed
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"""
This test sets a failpoint at the end of the first compaction phase:
after flushing new L1 layers but before deletion of the L0 layers.
It should cause compaction to generate a duplicate L1 layer after restart.
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_duplicate_layers",
)
env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": f"{1024 ** 2}",
"compaction_target_size": f"{1024 ** 2}",
"compaction_period": "0 s",
"compaction_threshold": "3",
}
)
pageserver_http = env.pageserver.http_client()
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit"))
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
pageserver_http.timeline_compact(tenant_id, timeline_id)
# pageserver has already exited at this point
env.pageserver.stop()
# now the duplicate L1 has been created, but is not yet uploaded
assert isinstance(env.remote_storage, LocalFsStorage)
# path = env.remote_storage.timeline_path(tenant_id, timeline_id)
l1_found = None
for path in env.timeline_dir(tenant_id, timeline_id).iterdir():
if path.name == "metadata" or path.name.startswith("ephemeral-"):
continue
if len(path.suffixes) > 0:
# temp files
continue
[key_range, lsn_range] = path.name.split("__", maxsplit=1)
if "-" not in lsn_range:
# image layer
continue
[key_start, key_end] = key_range.split("-", maxsplit=1)
if key_start == "0" * 36 and key_end == "F" * 36:
# L0
continue
assert l1_found is None, f"found multiple L1: {l1_found.name} and {path.name}"
l1_found = path
assert l1_found is not None, "failed to find L1 locally"
original_created_at = l1_found.stat()[8]
uploaded = env.remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
# give room for fs timestamps
time.sleep(1)
env.pageserver.start()
warning = f".*duplicated L1 layer layer={l1_found.name}"
env.pageserver.allowed_errors.append(warning)
pageserver_http.timeline_compact(tenant_id, timeline_id)
# give time for log flush
time.sleep(1)
assert env.pageserver.log_contains(warning) is not None
overwritten_at = l1_found.stat()[8]
assert original_created_at < overwritten_at, "expected the L1 to be overwritten"
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
uploaded_at = uploaded.stat()[8]
assert overwritten_at <= uploaded_at, "expected the L1 to finally be uploaded"
# why does compaction not wait for uploads? probably so that we can compact
# faster than we can upload in some cases.
#
# timeline_compact should wait for uploads as well
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr])

View File

@@ -256,34 +256,34 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
ps_http.evict_all_layers(tenant_id, timeline_id)
def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0, "ensure that all the layers are gone"
assert len(resident_layers) == 0
info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", info)
log.info("ensure that resident_physical_size metric is zero")
resident_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)
assert (
resident_physical_size_metric == 0
), "ensure that resident_physical_size metric is zero"
assert resident_physical_size_metric == 0
log.info("ensure that resident_physical_size metric corresponds to layer map dump")
assert resident_physical_size_metric == sum(
layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
), "ensure that resident_physical_size metric corresponds to layer map dump"
[layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
)
log.info("ensure that remote_physical_size metric matches layer map")
remote_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_remote_physical_size"
)
log.info("ensure that remote_physical_size metric corresponds to layer map dump")
assert remote_physical_size_metric == sum(
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
), "ensure that remote_physical_size metric corresponds to layer map dump"
)
log.info("before runnning GC, ensure that remote_physical size is zero")
# leaving the index_part.json upload from a successful compaction out will show
# up here as a mismatch between remote_physical_size and the summed-up layer map
# size
ensure_resident_and_remote_size_metrics()
log.info("run GC")

View File

@@ -13,12 +13,13 @@ from fixtures.neon_fixtures import (
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_state,
wait_for_last_record_lsn,
wait_for_upload,
wait_for_upload_queue_empty,
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn
@@ -390,7 +391,7 @@ def test_download_remote_layers_api(
env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
env.pageserver.allowed_errors.extend(
[
".*download failed: downloading evicted layer file failed.*",
f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint",
f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
]
)
@@ -657,5 +658,62 @@ def test_compaction_downloads_on_demand_with_image_creation(
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ondemand_download_failure_to_replace(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
Make sure that being unable to replace a RemoteLayer results in a failure instead of, for example, a livelock.
See: https://github.com/neondatabase/neon/issues/3533
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ondemand_download_failure_to_replace",
)
# disable gc and compaction via the default tenant config, because per-tenant config is lost
# while detaching; this ensures the http handler, not compaction, is the one to download the layer
neon_env_builder.pageserver_config_override = (
"""tenant_config={gc_period = "0s", compaction_period = "0s"}"""
)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
pageserver_http = env.pageserver.http_client()
# remove layers so that they will be redownloaded
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
# requesting details with non-incremental size should trigger a download of the only layer
# this will need to be adjusted if an index for logical sizes is ever implemented
with pytest.raises(PageserverApiException):
# PageserverApiException is expected because the failpoint makes the layer map replacement
# fail while the timeline_detail response is being built.
# ReadTimeout can happen on our busy CI, but it should not, because there is no more
# busylooping; should the busyloop ever be added back, we would wait for 15s here.
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
actual_message = ".* ERROR .*layermap-replace-notfound"
assert env.pageserver.log_contains(actual_message) is not None
env.pageserver.allowed_errors.append(actual_message)
env.pageserver.allowed_errors.append(
".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
)
# this might get to run and attempt on-demand, but not always
env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
# if the above returned, then we didn't have a livelock, and all is well
def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
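For reference, a small usage sketch of the stringify helper above (the config keys shown are illustrative; any dict values work):

# every value is converted with str(); keys are left untouched
conf = {"compaction_threshold": 3, "gc_period": "0s"}
assert stringify(conf) == {"compaction_threshold": "3", "gc_period": "0s"}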

View File

@@ -603,7 +603,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
log.info("sending delete request")
checkpoint_allowed_to_fail.set()
env.pageserver.allowed_errors.append(
".* ERROR .*Error processing HTTP request: InternalServerError\\(The timeline or pageserver is shutting down"
".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
)
# Generous timeout, because currently deletions can get blocked waiting for compaction
@@ -861,8 +861,10 @@ def test_compaction_delete_before_upload(
# Ensure that this actually terminates
wait_upload_queue_empty(client, tenant_id, timeline_id)
# fixed in #4938
assert not env.pageserver.log_contains(
# For now we are hitting this message.
# Maybe in the future the underlying race condition will be fixed,
# but until then, ensure that this message is hit instead.
assert env.pageserver.log_contains(
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
)

View File

@@ -688,7 +688,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: (Stopping|Broken).*"
f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*"
)
# ignore the tenant and remove its metadata

View File

@@ -239,7 +239,9 @@ def test_tenant_redownloads_truncated_file_on_startup(
assert isinstance(env.remote_storage, LocalFsStorage)
env.pageserver.allowed_errors.append(".*removing local file .* because .*")
env.pageserver.allowed_errors.append(
".*removing local file .* because it has unexpected length.*"
)
# FIXME: Are these expected?
env.pageserver.allowed_errors.append(

View File

@@ -1,12 +0,0 @@
# Configuration for cgroups in VM compute nodes
group neon-postgres {
perm {
admin {
uid = vm-informant;
}
task {
gid = users;
}
}
memory {}
}