Compare commits

1 commit

Author: Bojan Serafimov
SHA1: 5153e02f5a
Message: Simplify PR template
Date: 2023-10-30 11:30:50 -04:00
80 changed files with 706 additions and 5003 deletions

View File

@@ -1,14 +1,3 @@
## Problem
## Summary of changes
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging
- [ ] Do not forget to reformat commit message to not include the above checklist

View File

@@ -2,7 +2,7 @@ name: Create Release Branch
on:
schedule:
- cron: '0 7 * * 5'
- cron: '0 7 * * 2'
workflow_dispatch:
jobs:

Cargo.lock generated
View File

@@ -1609,6 +1609,16 @@ dependencies = [
"subtle",
]
[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
"quote",
"syn 1.0.109",
]
[[package]]
name = "ctr"
version = "0.6.0"
@@ -2704,10 +2714,11 @@ dependencies = [
[[package]]
name = "log"
version = "0.4.20"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
"value-bag",
]
@@ -3243,7 +3254,6 @@ dependencies = [
"num_cpus",
"once_cell",
"pageserver_api",
"pageserver_compaction",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -3302,52 +3312,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"clap",
"const_format",
"consumption_metrics",
"criterion",
"crossbeam-utils",
"either",
"fail",
"flate2",
"futures",
"git-version",
"hex",
"hex-literal",
"humantime",
"humantime-serde",
"itertools",
"metrics",
"once_cell",
"pin-project-lite",
"rand 0.8.5",
"smallvec",
"svg_fmt",
"sync_wrapper",
"thiserror",
"tokio",
"tokio-io-timeout",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
"url",
"utils",
"walkdir",
"workspace_hack",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -3597,7 +3561,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3610,7 +3574,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
"native-tls",
"tokio",
@@ -3621,7 +3585,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3639,7 +3603,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4466,7 +4430,6 @@ dependencies = [
"itertools",
"pageserver",
"rand 0.8.5",
"remote_storage",
"reqwest",
"serde",
"serde_json",
@@ -4525,7 +4488,6 @@ dependencies = [
"tokio",
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
"toml_edit",
"tracing",
"url",
@@ -5445,7 +5407,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
"async-trait",
"byteorder",
@@ -6049,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
[[package]]
name = "value-bag"
version = "1.4.2"
version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe"
checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]
[[package]]
name = "vcpkg"

View File

@@ -4,7 +4,6 @@ members = [
"compute_tools",
"control_plane",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
"proxy",
"safekeeper",
@@ -162,11 +161,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -176,7 +175,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -204,7 +202,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
################# Binary contents sections

View File

@@ -278,26 +278,32 @@ fn main() -> Result<()> {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
let vm_monitor_addr = matches
.get_one::<String>("vm-monitor-addr")
.expect("--vm-monitor-addr should always be set because it has a default arg");
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = if env::var_os("AUTOSCALING").is_some() {
Some(
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor")
)
} else {
None
.expect("failed to create tokio runtime for monitor"),
),
};
// This token is used internally by the monitor to clean up all threads
@@ -308,7 +314,8 @@ fn main() -> Result<()> {
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.clone(),
addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})),
token.clone(),
))
@@ -480,8 +487,6 @@ fn cli() -> clap::Command {
.value_name("FILECACHE_CONNSTR"),
)
.arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
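One side of this hunk decides whether to build a dedicated tokio runtime by matching on the pair of signals (the `AUTOSCALING` env var and the `--vm-monitor-addr` option) rather than checking the env var alone. Below is a minimal sketch of that tuple-match pattern, assuming the `tokio` crate with the `rt-multi-thread` feature; `maybe_runtime`, the hard-coded address, and the `eprintln!` standing in for `warn!` are illustrative, not compute_ctl's real code.

```rust
// Sketch only: mirrors the (env var, CLI option) tuple match from the hunk above.
// Assumes the `tokio` crate (rt-multi-thread feature); `vm_monitor_addr` stands in
// for the clap-parsed option.
use std::env;

fn maybe_runtime(vm_monitor_addr: Option<&str>) -> Option<tokio::runtime::Runtime> {
    match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
        // Neither signal present: no monitor, no runtime.
        (None, None) => None,
        // Option given without the env var: warn and skip, as in the hunk above.
        (None, Some(_)) => {
            eprintln!("--vm-monitor-addr set but AUTOSCALING env var not present");
            None
        }
        // Env var without an address is a configuration error.
        (Some(_), None) => panic!("AUTOSCALING env var present but --vm-monitor-addr not set"),
        // Both present: build a multi-threaded runtime dedicated to the monitor.
        (Some(_), Some(_)) => Some(
            tokio::runtime::Builder::new_multi_thread()
                .worker_threads(4)
                .enable_all()
                .build()
                .expect("failed to create tokio runtime for monitor"),
        ),
    }
}

fn main() {
    let rt = maybe_runtime(Some("0.0.0.0:10301"));
    println!("runtime created: {}", rt.is_some());
}
```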

View File

@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", format!("Bearer {}", jwt))
.header("Authorization", jwt)
.send()
.map_err(|e| {
(
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),
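The request in this hunk is a plain blocking reqwest GET; what changed is the URI path and whether the Authorization header carries a `Bearer ` prefix. A minimal sketch under stated assumptions (the `reqwest` crate with its `blocking` feature, plus `anyhow`); the base URI in `main` is hypothetical, and which header form is correct depends on what the control plane expects.

```rust
// Sketch only (assumes `reqwest` with the "blocking" feature and `anyhow`).
// URI shape and env var name are taken from the hunk above; error handling is simplified.
fn fetch_spec(base_uri: &str, compute_id: &str) -> anyhow::Result<String> {
    let uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
    let jwt = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();

    let resp = reqwest::blocking::Client::new()
        .get(uri)
        // One side of the diff sends the raw token, the other prefixes it with "Bearer ".
        .header("Authorization", format!("Bearer {jwt}"))
        .send()?;

    Ok(resp.text()?)
}

fn main() -> anyhow::Result<()> {
    // Hypothetical endpoint, for illustration only.
    let spec = fetch_spec("http://127.0.0.1:1234", "compute-123")?;
    println!("{spec}");
    Ok(())
}
```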

View File

@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
@@ -222,25 +221,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
generation: 0,
});
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
if attach_req.node_id.is_some() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/re-attach", handle_re_attach)
.post("/validate", handle_validate)
.post("/attach-hook", handle_attach_hook)
}
#[tokio::main]
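The attach-hook hunk above removes several structured log statements. As an isolated illustration of the `tracing` field syntax they rely on (`field = %value` records the value through its `Display` impl, `?value` would use `Debug`), here is a hedged sketch; `TenantId`, the field names, and the messages are stand-ins, not the attachment service's real types.

```rust
// Sketch only: the `tracing` structured-field syntax used in the removed log lines.
// Assumes the `tracing` and `tracing-subscriber` crates.
use tracing::info;

struct TenantId(u32);

impl std::fmt::Display for TenantId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "tenant-{}", self.0)
    }
}

fn log_generation_change(tenant_id: &TenantId, generation: u32, attaching: bool) {
    if attaching {
        // `%value` captures the field via Display, keeping it queryable as a field
        // rather than baking it into the message string.
        info!(tenant_id = %tenant_id, generation = %generation, "issuing");
    } else {
        info!(tenant_id = %tenant_id, generation = %generation, "dropping");
    }
}

fn main() {
    tracing_subscriber::fmt().init();
    log_generation_change(&TenantId(7), 3, true);
}
```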

View File

@@ -345,11 +345,6 @@ impl PageServerNode {
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
compaction_algorithm: settings
.remove("compaction_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -445,11 +440,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
compaction_algorithm: settings
.remove("compactin_algorithm")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
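Both versions of this settings code lean on the same idiom: take an optional string out of the map, parse it only if present, and use `transpose` so `?` can surface a parse error with context. A minimal sketch of that idiom, assuming `anyhow` and `serde_json`; the `TenantSettings` struct and the keys are illustrative only.

```rust
// Sketch only: the Option -> parse -> transpose idiom used throughout this settings code.
// Assumes `anyhow` and `serde_json`; the struct and keys are illustrative.
use std::collections::HashMap;

use anyhow::Context;

#[derive(Debug)]
struct TenantSettings {
    compaction_threshold: Option<usize>,
    compaction_algorithm: Option<serde_json::Value>,
}

fn parse_settings(mut settings: HashMap<String, String>) -> anyhow::Result<TenantSettings> {
    Ok(TenantSettings {
        // `map` parses only if the key was present; `transpose` turns
        // Option<Result<..>> into Result<Option<..>> so `?` can propagate the error.
        compaction_threshold: settings
            .remove("compaction_threshold")
            .map(|x| x.parse::<usize>())
            .transpose()
            .context("Failed to parse 'compaction_threshold' as an integer")?,
        // JSON-valued settings are parsed with serde_json instead of FromStr.
        compaction_algorithm: settings
            .remove("compaction_algorithm")
            .map(|s| serde_json::from_str(&s))
            .transpose()
            .context("Failed to parse 'compaction_algorithm' json")?,
    })
}

fn main() -> anyhow::Result<()> {
    let mut m = HashMap::new();
    m.insert("compaction_threshold".to_string(), "10".to_string());
    println!("{:?}", parse_settings(m)?);
    Ok(())
}
```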

View File

@@ -1,108 +0,0 @@
# Updating Postgres
## Minor Versions
When upgrading to a new minor version of Postgres, please follow these steps:
_Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Clone the Neon Postgres repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/postgres.git
```
1. Add the Postgres upstream remote.
```shell
git remote add upstream https://git.postgresql.org/git/postgresql.git
```
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch REL_15_STABLE_neon
```
1. Tag the last commit on the stable branch you are updating.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git rebase REL_15_4
```
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
```shell
make check
# OR
meson test -C builddir
```
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/neon.git
```
1. Create a new branch.
1. Change the `revisions.json` file to point at the HEAD of your Postgres
branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
1. Run the Neon test suite to make sure that Neon is still good to go on this
minor Postgres release.
```shell
./scripts/poetry -k pg15
```
1. Commit your changes.
1. Create a pull request, and wait for CI to go green.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
git commit --amend --no-edit
git push --force origin
```
1. Merge the pull request after getting approval(s) and CI completion.

View File

@@ -227,8 +227,6 @@ pub struct TenantConfig {
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<serde_json::Value>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -326,7 +324,6 @@ impl TenantConfigRequest {
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
compaction_algorithm: None,
gc_horizon: None,
gc_period: None,
image_creation_threshold: None,
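The field added or removed here stores the compaction algorithm as a raw `serde_json::Value`, deferring parsing to the consumer (the comment compares it to `eviction_policy`). A hedged sketch of that deferred-parsing pattern, assuming `serde`, `serde_json`, and `anyhow`; `CompactionAlgorithm` and its variants are hypothetical, not the pageserver's actual enum.

```rust
// Sketch only: deferring parsing of a config field by keeping raw JSON in the API type
// and decoding it where it is used. Assumes `serde` (derive), `serde_json`, `anyhow`.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
#[serde(rename_all = "kebab-case")]
enum CompactionAlgorithm {
    Legacy,
    Tiered,
}

#[derive(Debug, Deserialize)]
struct TenantConfig {
    // Kept as an opaque JSON value at the API boundary; parsed by the consumer.
    compaction_algorithm: Option<serde_json::Value>,
}

fn effective_algorithm(cfg: &TenantConfig) -> anyhow::Result<Option<CompactionAlgorithm>> {
    cfg.compaction_algorithm
        .clone()
        .map(serde_json::from_value) // Option<Result<CompactionAlgorithm, _>>
        .transpose()                 // Result<Option<CompactionAlgorithm>, _>
        .map_err(anyhow::Error::from)
}

fn main() -> anyhow::Result<()> {
    let cfg: TenantConfig = serde_json::from_str(r#"{"compaction_algorithm": "tiered"}"#)?;
    println!("{:?}", effective_algorithm(&cfg)?);
    Ok(())
}
```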

View File

@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
($version:ident) => {
#[path = "."]
mod $version {
#[allow(unused_imports)]
pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)]
#[cfg(test)]

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used

View File

@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
}
}
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
pub fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
pub fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
SERVE_METRICS_COUNT.inc();
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
struct ChannelWriter {
buffer: BytesMut,
tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1);
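Both sides of this hunk implement `std::io::Write` on top of a bounded `mpsc` channel so a blocking encoder can stream chunks into the HTTP response instead of buffering the whole payload. The sketch below shows how such a writer is wired to a hyper `Body`; it uses a deliberately simplified writer (no `BytesMut` buffering or flush throttling) and assumes `tokio`, `tokio-stream`, `bytes`, and hyper 0.14 with the `stream` feature. It illustrates the pattern, not the pageserver's handler.

```rust
// Sketch only: stream a blocking producer's output through an mpsc channel into a hyper Body.
use std::io::Write;

use bytes::Bytes;
use hyper::{Body, Response};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

/// Much-simplified stand-in for the ChannelWriter above: every `write` ships the bytes
/// straight into the channel, with no internal buffering.
struct SimpleChannelWriter {
    tx: mpsc::Sender<std::io::Result<Bytes>>,
}

impl Write for SimpleChannelWriter {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        let chunk = Bytes::copy_from_slice(buf);
        // We are on a blocking-pool thread, so block_on via the runtime handle is allowed.
        tokio::runtime::Handle::current()
            .block_on(self.tx.send(Ok(chunk)))
            .map_err(|_| std::io::Error::from(std::io::ErrorKind::BrokenPipe))?;
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}

async fn streamed_response() -> Response<Body> {
    let (tx, rx) = mpsc::channel(1);

    // The producer runs on the blocking pool; the response streams chunks as they arrive.
    tokio::task::spawn_blocking(move || {
        let mut w = SimpleChannelWriter { tx };
        for i in 0..3 {
            let _ = writeln!(w, "metric_line {i}");
        }
    });

    Response::new(Body::wrap_stream(ReceiverStream::new(rx)))
}

#[tokio::main]
async fn main() {
    let resp = streamed_response().await;
    let body = hyper::body::to_bytes(resp.into_body()).await.unwrap();
    println!("{}", String::from_utf8_lossy(&body));
}
```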

View File

@@ -1,7 +1,4 @@
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex, MutexGuard,
};
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::Semaphore;
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -13,7 +10,6 @@ use tokio::sync::Semaphore;
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
inner: Mutex<Inner<T>>,
initializers: AtomicUsize,
}
impl<T> Default for OnceCell<T> {
@@ -21,7 +17,6 @@ impl<T> Default for OnceCell<T> {
fn default() -> Self {
Self {
inner: Default::default(),
initializers: AtomicUsize::new(0),
}
}
}
@@ -54,7 +49,6 @@ impl<T> OnceCell<T> {
init_semaphore: Arc::new(sem),
value: Some(value),
}),
initializers: AtomicUsize::new(0),
}
}
@@ -66,8 +60,8 @@ impl<T> OnceCell<T> {
/// Initialization is panic-safe and cancellation-safe.
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
where
F: FnOnce(InitPermit) -> Fut,
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
F: FnOnce() -> Fut,
Fut: std::future::Future<Output = Result<T, E>>,
{
let sem = {
let guard = self.inner.lock().unwrap();
@@ -77,61 +71,29 @@ impl<T> OnceCell<T> {
guard.init_semaphore.clone()
};
let permit = {
// increment the count for the duration of queued
let _guard = CountWaitingInitializers::start(self);
sem.acquire_owned().await
};
let permit = sem.acquire_owned().await;
if permit.is_err() {
let guard = self.inner.lock().unwrap();
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
} else {
// now we try
let value = factory().await?;
match permit {
Ok(permit) => {
let permit = InitPermit(permit);
let (value, _permit) = factory(permit).await?;
let guard = self.inner.lock().unwrap();
Ok(Self::set0(value, guard))
}
Err(_closed) => {
let guard = self.inner.lock().unwrap();
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
}
let mut guard = self.inner.lock().unwrap();
assert!(
guard.value.is_none(),
"we won permit, must not be initialized"
);
guard.value = Some(value);
guard.init_semaphore.close();
Ok(Guard(guard))
}
}
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
/// to complete initializing the inner value.
///
/// # Panics
///
/// If the inner has already been initialized.
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
let guard = self.inner.lock().unwrap();
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
// give more permits right now.
if guard.init_semaphore.try_acquire().is_ok() {
drop(guard);
panic!("permit is of wrong origin");
}
Self::set0(value, guard)
}
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
if guard.value.is_some() {
drop(guard);
unreachable!("we won permit, must not be initialized");
}
guard.value = Some(value);
guard.init_semaphore.close();
Guard(guard)
}
/// Returns a guard to an existing initialized value, if any.
pub fn get(&self) -> Option<Guard<'_, T>> {
let guard = self.inner.lock().unwrap();
@@ -141,28 +103,6 @@ impl<T> OnceCell<T> {
None
}
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed)
}
}
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
/// initializing task for example at the end of initialization.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
impl<'a, T> CountWaitingInitializers<'a, T> {
fn start(target: &'a OnceCell<T>) -> Self {
target.initializers.fetch_add(1, Ordering::Relaxed);
CountWaitingInitializers(target)
}
}
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
fn drop(&mut self) {
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
}
}
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -195,7 +135,7 @@ impl<'a, T> Guard<'a, T> {
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
let mut swapped = Inner::default();
let permit = swapped
.init_semaphore
@@ -205,14 +145,11 @@ impl<'a, T> Guard<'a, T> {
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, InitPermit(permit)))
.map(|v| (v, permit))
.expect("guard is not created unless value has been initialized")
}
}
/// Type held by OnceCell (de)initializing task.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
#[cfg(test)]
mod tests {
use super::*;
@@ -248,11 +185,11 @@ mod tests {
barrier.wait().await;
let won = {
let g = cell
.get_or_init(|permit| {
.get_or_init(|| {
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
async {
counters.future_polled.fetch_add(1, Ordering::Relaxed);
Ok::<_, Infallible>((i, permit))
Ok::<_, Infallible>(i)
}
})
.await
@@ -306,7 +243,7 @@ mod tests {
deinitialization_started.wait().await;
let started_at = tokio::time::Instant::now();
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
.await
.unwrap();
@@ -321,32 +258,18 @@ mod tests {
assert_eq!(*cell.get().unwrap(), reinit);
}
#[test]
fn reinit_with_deinit_permit() {
let cell = Arc::new(OnceCell::new(42));
let (mol, permit) = cell.get().unwrap().take_and_deinit();
cell.set(5, permit);
assert_eq!(*cell.get().unwrap(), 5);
let (five, permit) = cell.get().unwrap().take_and_deinit();
assert_eq!(5, five);
cell.set(mol, permit);
assert_eq!(*cell.get().unwrap(), 42);
}
#[tokio::test]
async fn initialization_attemptable_until_ok() {
let cell = OnceCell::default();
for _ in 0..10 {
cell.get_or_init(|_permit| async { Err("whatever error") })
cell.get_or_init(|| async { Err("whatever error") })
.await
.unwrap_err();
}
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
.await
.unwrap();
assert_eq!(*g, "finally success");
@@ -358,11 +281,11 @@ mod tests {
let barrier = tokio::sync::Barrier::new(2);
let initializer = cell.get_or_init(|permit| async {
let initializer = cell.get_or_init(|| async {
barrier.wait().await;
futures::future::pending::<()>().await;
Ok::<_, Infallible>(("never reached", permit))
Ok::<_, Infallible>("never reached")
});
tokio::select! {
@@ -375,7 +298,7 @@ mod tests {
assert!(cell.get().is_none());
let g = cell
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
.await
.unwrap();
assert_eq!(*g, "now initialized");
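The signature change in this hunk is about whether the init factory receives an `InitPermit` and hands it back with the value; either way, only one caller runs the factory while concurrent callers wait for the result. For comparison, the same get-or-initialize pattern with the public `tokio::sync::OnceCell` looks like this (a sketch assuming only the `tokio` crate; it does not model the custom cell's deinitialization or permits).

```rust
// Sketch only: the analogous pattern with the public tokio::sync::OnceCell, for comparison
// with the custom cell in the diff above.
use std::convert::Infallible;

use tokio::sync::OnceCell;

static CONFIG: OnceCell<String> = OnceCell::const_new();

async fn load_config() -> Result<String, Infallible> {
    // Imagine an expensive or fallible initialization here; only one caller runs it,
    // and concurrent callers wait for the outcome.
    Ok("loaded".to_string())
}

#[tokio::main]
async fn main() {
    let first = CONFIG.get_or_try_init(load_config).await.unwrap();
    let second = CONFIG.get_or_try_init(load_config).await.unwrap();
    assert!(std::ptr::eq(first, second)); // same initialized value, factory ran once
    println!("{first}");
}
```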

View File

@@ -21,6 +21,11 @@ pub struct FileCacheState {
#[derive(Debug)]
pub struct FileCacheConfig {
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
/// memory available for the cgroup.
pub(crate) in_memory: bool,
/// The size of the file cache, in terms of the size of the resource it consumes
/// (currently: only memory)
///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
spread_factor: f64,
}
impl Default for FileCacheConfig {
fn default() -> Self {
impl FileCacheConfig {
pub fn default_in_memory() -> Self {
Self {
in_memory: true,
// 75 %
resource_multiplier: 0.75,
// 640 MiB; (512 + 128)
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
// ensure any increase in file cache size is split 90-10 with 10% to other memory
spread_factor: 0.1,
}
}
pub fn default_on_disk() -> Self {
Self {
in_memory: false,
resource_multiplier: 0.75,
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
// memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
spread_factor: 0.1,
}
}
}
impl FileCacheConfig {
/// Make sure fields of the config are consistent.
pub fn validate(&self) -> anyhow::Result<()> {
// Single field validity

View File

@@ -39,6 +39,16 @@ pub struct Args {
#[arg(short, long)]
pub pgconnstr: Option<String>,
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
/// kernel's page cache), and therefore should not count against available memory.
//
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
// during the switch away from an in-memory file cache, we had to default to the previous
// behavior.
#[arg(long)]
pub file_cache_on_disk: bool,
/// The address we should listen on for connection requests. For the
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
#[arg(short, long)]
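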

View File

@@ -156,7 +156,10 @@ impl Runner {
// memory limits.
if let Some(connstr) = &args.pgconnstr {
info!("initializing file cache");
let config = FileCacheConfig::default();
let config = match args.file_cache_on_disk {
true => FileCacheConfig::default_on_disk(),
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await
@@ -184,7 +187,10 @@ impl Runner {
info!("file cache size actually got set to {actual_size}")
}
file_cache_disk_size = actual_size;
if args.file_cache_on_disk {
file_cache_disk_size = actual_size;
}
state.filecache = Some(file_cache);
}
@@ -233,11 +239,17 @@ impl Runner {
let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let expected_file_cache_size = self
let (expected_file_cache_size, expected_file_cache_disk_size) = self
.filecache
.as_ref()
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
.unwrap_or(0);
.map(|file_cache| {
let size = file_cache.config.calculate_cache_size(usable_system_memory);
match file_cache.config.in_memory {
true => (size, 0),
false => (size, size),
}
})
.unwrap_or((0, 0));
if let Some(cgroup) = &self.cgroup {
let (last_time, last_history) = *cgroup.watcher.borrow();
@@ -261,7 +273,7 @@ impl Runner {
let new_threshold = self
.config
.cgroup_threshold(usable_system_memory, expected_file_cache_size);
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
let current = last_history.avg_non_reclaimable;
@@ -288,10 +300,13 @@ impl Runner {
.set_file_cache_size(expected_file_cache_size)
.await
.context("failed to set file cache size")?;
file_cache_disk_size = actual_usage;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
let message = format!(
"set file cache size to {} MiB",
"set file cache size to {} MiB (in memory = {})",
bytes_to_mebibytes(actual_usage),
file_cache.config.in_memory,
);
info!("downscale: {message}");
status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
.set_file_cache_size(expected_usage)
.await
.context("failed to set file cache size")?;
file_cache_disk_size = actual_usage;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
if actual_usage != expected_usage {
warn!(

View File

@@ -68,7 +68,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true

View File

@@ -1,53 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

View File

@@ -1,49 +0,0 @@
# TODO
- If the key space can be perfectly partitioned at some key, perform planning on each
partition separately. For example, if we are compacting a level with layers like this:
:
+--+ +----+ : +------+
| | | | : | |
+--+ +----+ : +------+
:
+-----+ +-+ : +--------+
| | | | : | |
+-----+ +-+ : +--------+
:
At the dotted line, there is a natural split in the key space, such that all
layers are either on the left or the right of it. We can compact the
partitions separately. We could choose to create image layers for one
partition but not the other one, for example.
- All the layers don't have to be exactly the same size, we can choose to cut a
layer short or stretch it a little larger than the target size, if it helps
the overall system. We can help perfect partitions (see previous bullet point)
to happen more frequently, by choosing the cut points wisely. For example, try
to cut layers at boundaries of underlying image layers. And "snap to grid",
i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
- Avoid rewriting layers when we'd just create an identical layer to an input
layer.
- Parallelism. The code is already split up into planning and execution, so that
we first split up the compaction work into "Jobs", and then execute them.
It would be straightforward to execute multiple jobs in parallel.
- Materialize extra pages in delta layers during compaction. This would reduce
read amplification. There has been the idea of partial image layers. Materializing
extra pages in the delta layers achieve the same goal, without introducing a new
concept.
## Simulator
- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
a MockRecord that is newer than PITR horizon is completely dropped. That would
indicate that the record was lost.
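The last bullet above suggests catching lost records by reference-counting `MockRecord` and panicking if one newer than the PITR horizon is dropped. A hedged sketch of that idea follows; `MockRecord`, `Lsn`, and the global horizon are hypothetical stand-ins for the simulator's types.

```rust
// Sketch only: one way to implement the sanity check proposed above.
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

type Lsn = u64;

/// Global PITR horizon the check compares against (updated by the simulator).
static PITR_HORIZON: AtomicU64 = AtomicU64::new(0);

struct MockRecord {
    lsn: Lsn,
    _payload: Vec<u8>,
}

/// Wrapper whose Drop fires when the *last* Arc clone goes away. If that happens while
/// the record is still newer than the PITR horizon, the record was lost.
struct TrackedRecord(MockRecord);

impl Drop for TrackedRecord {
    fn drop(&mut self) {
        let horizon = PITR_HORIZON.load(Ordering::Relaxed);
        if self.0.lsn > horizon {
            panic!(
                "MockRecord at LSN {} dropped while still above PITR horizon {}",
                self.0.lsn, horizon
            );
        }
    }
}

fn main() {
    PITR_HORIZON.store(100, Ordering::Relaxed);

    // Record below the horizon: dropping the last reference is fine.
    let old = Arc::new(TrackedRecord(MockRecord { lsn: 50, _payload: vec![] }));
    let clone = Arc::clone(&old);
    drop(old);
    drop(clone); // last reference gone, lsn <= horizon, no panic

    // A record with lsn > 100 would panic here when its last Arc was dropped.
}
```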

View File

@@ -1,214 +0,0 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use utils::project_git_version;
project_git_version!(GIT_VERSION);
#[derive(Parser)]
#[command(
version = GIT_VERSION,
about = "Neon Pageserver compaction simulator",
long_about = "A developer tool to visualize and test compaction"
)]
#[command(propagate_version = true)]
struct CliOpts {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
RunSuite,
Simulate(SimulateCmd),
}
#[derive(Clone, clap::ValueEnum)]
enum Distribution {
Uniform,
HotCold,
}
/// Read and update pageserver metadata file
#[derive(Parser)]
struct SimulateCmd {
distribution: Distribution,
/// Number of records to digest
num_records: u64,
/// Record length
record_len: u64,
// Logical database size in MB
logical_size: u64,
}
async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",
key_range.start, key_range.end
);
// helper function to print progress indicator
let print_progress = |i| -> anyhow::Result<()> {
if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
print!(
"\ringested {} / {} records, {} MiB / {} MiB...",
i + 1,
cmd.num_records,
(i + 1) * cmd.record_len / (1_000_000),
cmd.num_records * cmd.record_len / (1_000_000),
);
std::io::stdout().flush()?;
}
Ok(())
};
match cmd.distribution {
Distribution::Uniform => {
for i in 0..cmd.num_records {
executor.ingest_uniform(1, cmd.record_len, &key_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
Distribution::HotCold => {
let splitpoint = key_range.end / 10;
let hot_key_range = 0..splitpoint;
let cold_key_range = splitpoint..key_range.end;
for i in 0..cmd.num_records {
let chosen_range = if rand::thread_rng().gen_bool(0.9) {
&hot_key_range
} else {
&cold_key_range
};
executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
executor.compact_if_needed().await?;
print_progress(i)?;
}
}
}
println!("done!");
executor.flush_l0();
executor.compact_if_needed().await?;
let stats = executor.print_stats()?;
// Print the stats to stdout, and also to a file
print!("{}", stats);
std::fs::write(results_path.join("stats.txt"), stats)?;
let animation_path = results_path.join("compaction-animation.html");
executor.draw_history(std::fs::File::create(&animation_path)?)?;
println!(
"animation: file://{}",
animation_path.canonicalize()?.display()
);
Ok(())
}
async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
std::fs::create_dir(results_path)?;
set_log_file(File::create(results_path.join("log"))?);
let result = simulate(workload, results_path).await;
set_log_stdout();
result
}
async fn run_suite() -> anyhow::Result<()> {
let top_results_path = PathBuf::from(format!(
"compaction-suite-results.{}",
std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
));
std::fs::create_dir(&top_results_path)?;
let workload = SimulateCmd {
distribution: Distribution::Uniform,
// Generate 20 GB of WAL
record_len: 1_000,
num_records: 20_000_000,
// Logical size 5 GB
logical_size: 5_000,
};
run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
println!(
"All tests finished. Results in {}",
top_results_path.display()
);
Ok(())
}
use std::fs::File;
use std::io::Stdout;
use std::sync::Mutex;
use tracing_subscriber::fmt::writer::EitherWriter;
use tracing_subscriber::fmt::MakeWriter;
static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
}
fn set_log_file(f: File) {
*get_log_output().lock().unwrap() = EitherWriter::A(f);
}
fn set_log_stdout() {
*get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
}
fn init_logging() -> anyhow::Result<()> {
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let rust_log_env_filter = || {
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
};
// NB: the order of the with() calls does not matter.
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
use tracing_subscriber::prelude::*;
tracing_subscriber::registry()
.with({
let log_layer = tracing_subscriber::fmt::layer()
.with_target(false)
.with_ansi(false)
.with_writer(|| get_log_output().make_writer());
log_layer.with_filter(rust_log_env_filter())
})
.init();
Ok(())
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
init_logging()?;
match cli.command {
Commands::Simulate(cmd) => {
simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
}
Commands::RunSuite => {
run_suite().await?;
}
};
Ok(())
}

View File

@@ -1,870 +0,0 @@
//! # Tiered compaction algorithm.
//!
//! Read all the input delta files, and write a new set of delta files that
//! include all the input WAL records. See retile_deltas().
//!
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
//! later values, but in our system, we keep all the history. So the reshuffling
//! doesn't remove any garbage, it just reshuffles the records to reduce read
//! amplification, i.e. the number of files that you need to access to find the
//! WAL records for a given key.
//!
//! If the new delta files would be very "narrow", i.e. each file would cover
//! only a narrow key range, then we create a new set of image files
//! instead. The current threshold is that if the estimated total size of the
//! image layers is smaller than the size of the deltas, then we create image
//! layers. That amounts to 2x storage amplification, and it means that the
//! distance of image layers in LSN dimension is roughly equal to the logical
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
//!
use futures::StreamExt;
use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
use crate::identify_levels::identify_level;
/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
/// everything below that point, that needs compaction. The cutoff LSN must
/// partition the layers so that there are no layers that span across that
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
/// written last L0 layer.
pub async fn compact_tiered<E: CompactionJobExecutor>(
executor: &mut E,
end_lsn: Lsn,
target_file_size: u64,
fanout: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
assert!(fanout >= 2);
// Start at L0
let mut current_level_no = 0;
let mut current_level_target_height = target_file_size;
loop {
// end LSN +1 to include possible image layers exactly at 'end_lsn'.
let all_layers = executor
.get_layers(
&(E::Key::MIN..E::Key::MAX),
&(Lsn(u64::MIN)..end_lsn + 1),
ctx,
)
.await?;
info!(
"Compacting L{}, total # of layers: {}",
current_level_no,
all_layers.len()
);
// Identify the range of LSNs that belong to this level. We assume that
// each file in this level span an LSN range up to 1.75x target file
// size. That should give us enough slop that if we created a slightly
// oversized L0 layer, e.g. because flushing the in-memory layer was
// delayed for some reason, we don't consider the oversized layer to
// belong to L1. But not too much slop, that we don't accidentally
// "skip" levels.
let max_height = (current_level_target_height as f64 * 1.75) as u64;
let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
break;
};
// Calculate the height of this level. If the # of tiers exceeds the
// fanout parameter, it's time to compact it.
let depth = level.depth();
info!(
"Level {} identified as LSN range {}-{}: depth {}",
current_level_no, level.lsn_range.start, level.lsn_range.end, depth
);
for l in &level.layers {
debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
}
if depth < fanout {
debug!(
level = current_level_no,
depth = depth,
fanout,
"too few deltas to compact"
);
break;
}
compact_level(
&level.lsn_range,
&level.layers,
executor,
target_file_size,
ctx,
)
.await?;
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
current_level_target_height = current_level_target_height.saturating_mul(fanout);
}
Ok(())
}
async fn compact_level<E: CompactionJobExecutor>(
lsn_range: &Range<Lsn>,
layers: &[E::Layer],
executor: &mut E,
target_file_size: u64,
ctx: &E::RequestContext,
) -> anyhow::Result<bool> {
let mut layer_fragments = Vec::new();
for l in layers {
layer_fragments.push(LayerFragment::new(l.clone()));
}
let mut state = LevelCompactionState {
target_file_size,
_lsn_range: lsn_range.clone(),
layers: layer_fragments,
jobs: Vec::new(),
job_queue: Vec::new(),
next_level: false,
executor,
};
let first_job = CompactionJob {
key_range: E::Key::MIN..E::Key::MAX,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::Divide,
input_layers: state
.layers
.iter()
.enumerate()
.map(|i| LayerId(i.0))
.collect(),
completed: false,
};
state.jobs.push(first_job);
state.job_queue.push(JobId(0));
state.execute(ctx).await?;
info!(
"compaction completed! Need to process next level: {}",
state.next_level
);
Ok(state.next_level)
}
/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
// parameters
target_file_size: u64,
_lsn_range: Range<Lsn>,
layers: Vec<LayerFragment<E>>,
// job queue
jobs: Vec<CompactionJob<E>>,
job_queue: Vec<JobId>,
/// If false, no need to compact levels below this
next_level: bool,
/// Interface to the outside world
executor: &'a mut E,
}
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);
struct PendingJobSet {
pending: HashSet<JobId>,
completed: HashSet<JobId>,
}
impl PendingJobSet {
fn new() -> Self {
PendingJobSet {
pending: HashSet::new(),
completed: HashSet::new(),
}
}
fn complete_job(&mut self, job_id: JobId) {
self.pending.remove(&job_id);
self.completed.insert(job_id);
}
fn all_completed(&self) -> bool {
self.pending.is_empty()
}
}
// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track which new layers supersede an old layer. When all the stakeholder jobs
// have completed, this layer can be deleted.
struct LayerFragment<E>
where
E: CompactionJobExecutor,
{
layer: E::Layer,
// If we will write new layers to replace this one, this keeps track of the
// jobs that need to complete before this layer can be deleted. As the jobs
// complete, they are moved from 'pending' to 'completed' set. Once the
// 'pending' set becomes empty, the layer can be deleted.
//
// If None, this layer is not rewritten and must not be deleted.
deletable_after: Option<PendingJobSet>,
deleted: bool,
}
impl<E> LayerFragment<E>
where
E: CompactionJobExecutor,
{
fn new(layer: E::Layer) -> Self {
LayerFragment {
layer,
deletable_after: None,
deleted: false,
}
}
}
#[derive(PartialEq)]
enum CompactionStrategy {
Divide,
CreateDelta,
CreateImage,
}
#[allow(dead_code)] // Todo
struct CompactionJob<E: CompactionJobExecutor> {
key_range: Range<E::Key>,
lsn_range: Range<Lsn>,
strategy: CompactionStrategy,
input_layers: Vec<LayerId>,
completed: bool,
}
impl<'a, E> LevelCompactionState<'a, E>
where
E: CompactionJobExecutor,
{
/// Main loop of the executor.
///
/// In each iteration, we take the next job from the queue, and execute it.
/// The execution might add new jobs to the queue. Keep going until the
/// queue is empty.
///
/// Initially, the job queue consists of one Divide job over the whole
/// level. On first call, it is divided into smaller jobs.
///
async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
// TODO: this would be pretty straightforward to parallelize with FuturesUnordered
while let Some(next_job_id) = self.job_queue.pop() {
info!("executing job {}", next_job_id.0);
self.execute_job(next_job_id, ctx).await?;
}
// all done!
Ok(())
}
async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
match job.strategy {
CompactionStrategy::Divide => {
self.divide_job(job_id, ctx).await?;
Ok(())
}
CompactionStrategy::CreateDelta => {
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
let mut layer_ids: Vec<LayerId> = Vec::new();
for layer_id in &job.input_layers {
let layer = &self.layers[layer_id.0].layer;
if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
deltas.push(dl.clone());
layer_ids.push(*layer_id);
}
}
self.executor
.create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// did we complete any fragments?
for layer_id in layer_ids {
let l = &mut self.layers[layer_id.0];
if let Some(deletable_after) = l.deletable_after.as_mut() {
deletable_after.complete_job(job_id);
if deletable_after.all_completed() {
self.executor.delete_layer(&l.layer, ctx).await?;
l.deleted = true;
}
}
}
self.next_level = true;
Ok(())
}
CompactionStrategy::CreateImage => {
self.executor
.create_image(job.lsn_range.end, &job.key_range, ctx)
.await?;
self.jobs[job_id.0].completed = true;
// TODO: we could check if any layers < PITR horizon became deletable
Ok(())
}
}
}
fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
let job_id = JobId(self.jobs.len());
self.jobs.push(job);
self.job_queue.push(job_id);
job_id
}
///
/// Take a partition of the key space, and decide how to compact it.
///
/// TODO: Currently, this is called exactly once for the level, and we
/// decide whether to create new image layers to cover the whole level, or
/// write a new set of delta. In the future, this should try to partition
/// the key space, and make the decision separately for each partition.
///
async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Check for dummy cases
if job.input_layers.is_empty() {
return Ok(());
}
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Would it be better to create images for this partition?
// Decide based on the average density of the level
let keyspace_size = keyspace_total_size(
&self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
) * 8192;
let wal_size = job
.input_layers
.iter()
.filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
.map(|layer_id| self.layers[layer_id.0].layer.file_size())
.sum::<u64>();
if keyspace_size < wal_size {
// seems worth it
info!(
"covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
);
self.cover_with_images(job_id, ctx).await
} else {
// do deltas
info!(
"coverage not worth it, keyspace_size {}, wal_size {}",
keyspace_size, wal_size
);
self.retile_deltas(job_id, ctx).await
}
}
// LSN
// ^
// |
// | ###|###|#####
// | +--+-----+--+ +--+-----+--+
// | | | | | | | | |
// | +--+--+--+--+ +--+--+--+--+
// | | | | | | |
// | +---+-+-+---+ ==> +---+-+-+---+
// | | | | | | | | |
// | +---+-+-++--+ +---+-+-++--+
// | | | | | | | | |
// | +-----+--+--+ +-----+--+--+
// |
// +--------------> key
//
async fn cover_with_images(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// XXX: do we still need the "holes" stuff?
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let keyspace = self
.executor
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?;
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image() {
new_jobs.push(CompactionJob::<E> {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateImage,
input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
completed: false,
});
}
for j in new_jobs.into_iter().rev() {
let _job_id = self.push_job(j);
// TODO: image layers don't let us delete anything. unless < PITR horizon
//let j = &self.jobs[job_id.0];
// for layer_id in j.input_layers.iter() {
// self.layers[layer_id.0].pending_stakeholders.insert(job_id);
//}
}
Ok(())
}
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
//
// We split the new delta layers on the key dimension. We iterate through
// the key space, and for each key, check if including the next key to the
// current output layer we're building would cause the layer to become too
// large. If so, dump the current output layer and start new one. It's
// possible that there is a single key with so many page versions that
// storing all of them in a single layer file would be too large. In that
// case, we also split on the LSN dimension.
//
// LSN
// ^
// |
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | | | |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
//
// If one key (X) has a lot of page versions:
//
// LSN
// ^
// | (X)
// | +-----------+ +--+--+--+--+
// | | | | | | | |
// | +-----------+ | | +--+ |
// | | | | | | | |
// | +-----------+ ==> | | | | |
// | | | | | +--+ |
// | +-----------+ | | | | |
// | | | | | | | |
// | +-----------+ +--+--+--+--+
// |
// +--------------> key
//
// TODO: this actually divides the layers into fixed-size chunks, not
// based on the partitioning.
//
// TODO: we should also opportunistically materialize and
// garbage collect what we can.
async fn retile_deltas(
&mut self,
job_id: JobId,
ctx: &E::RequestContext,
) -> anyhow::Result<()> {
let job = &self.jobs[job_id.0];
assert!(job.strategy == CompactionStrategy::Divide);
// Sweep the key space left to right, running an estimate of how much
// disk size and keyspace we have accumulated
//
// Once the disk size reaches the target threshold, stop and think.
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
let mut deltas: Vec<E::DeltaLayer> = Vec::new();
for layer_id in &job.input_layers {
let l = &self.layers[layer_id.0];
if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
deltas.push(dl.clone());
}
}
// Open stream
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut all_in_window: bool = false;
let mut window = Window::new();
loop {
if all_in_window && window.elems.is_empty() {
// All done!
break;
}
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
} else {
all_in_window = true;
}
}
}
// All the input files are rewritten. Set up the tracking for when they can
// be deleted.
for layer_id in job.input_layers.iter() {
let l = &mut self.layers[layer_id.0];
assert!(l.deletable_after.is_none());
l.deletable_after = Some(PendingJobSet::new());
}
for j in new_jobs.into_iter().rev() {
let job_id = self.push_job(j);
let j = &self.jobs[job_id.0];
for layer_id in j.input_layers.iter() {
self.layers[layer_id.0]
.deletable_after
.as_mut()
.unwrap()
.pending
.insert(job_id);
}
}
Ok(())
}
}
// Sliding window through keyspace and values
// This is used by cover_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
start_pos: KeyspaceWindowPos<K>,
}
struct KeyspaceWindowHead<K> {
// overall key range to cover
key_range: Range<K>,
keyspace: Vec<Range<K>>,
target_keysize: u64,
}
#[derive(Clone)]
struct KeyspaceWindowPos<K> {
end_key: K,
keyspace_idx: usize,
accum_keysize: u64,
}
impl<K: CompactionKey> KeyspaceWindowPos<K> {
fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
self.keyspace_idx == w.keyspace.len()
}
// Advance the cursor until it reaches 'target_keysize'.
fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
while self.accum_keysize < max_size && !self.reached_end(w) {
let curr_range = &w.keyspace[self.keyspace_idx];
if self.end_key < curr_range.start {
// skip over any unused space
self.end_key = curr_range.start;
}
// We're now within 'curr_range'. Can we advance past it completely?
let distance = K::key_range_size(&(self.end_key..curr_range.end));
if (self.accum_keysize + distance as u64) < max_size {
// oh yeah, it fits
self.end_key = curr_range.end;
self.keyspace_idx += 1;
self.accum_keysize += distance as u64;
} else {
// advance within the range
let skip_key = self.end_key.skip_some();
let distance = K::key_range_size(&(self.end_key..skip_key));
if (self.accum_keysize + distance as u64) < max_size {
self.end_key = skip_key;
self.accum_keysize += distance as u64;
} else {
self.end_key = self.end_key.next();
self.accum_keysize += 1;
}
}
}
}
}
impl<K> KeyspaceWindow<K>
where
K: CompactionKey,
{
fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
assert!(keyspace.first().unwrap().start >= key_range.start);
let start_key = key_range.start;
let start_pos = KeyspaceWindowPos::<K> {
end_key: start_key,
keyspace_idx: 0,
accum_keysize: 0,
};
Self {
head: KeyspaceWindowHead::<K> {
key_range,
keyspace,
target_keysize,
},
start_pos,
}
}
fn choose_next_image(&mut self) -> Option<Range<K>> {
if self.start_pos.keyspace_idx == self.head.keyspace.len() {
// we've reached the end
return None;
}
let mut next_pos = self.start_pos.clone();
next_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + self.head.target_keysize,
);
// See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
// 1.25x target size
let mut end_pos = next_pos.clone();
end_pos.advance_until_size(
&self.head,
self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
);
if end_pos.reached_end(&self.head) {
// gobble up any unused keyspace between the last used key and end of the range
assert!(end_pos.end_key <= self.head.key_range.end);
end_pos.end_key = self.head.key_range.end;
next_pos = end_pos;
}
let start_key = self.start_pos.end_key;
self.start_pos = next_pos;
Some(start_key..self.start_pos.end_key)
}
}
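// A minimal sketch (not in the original source) of how KeyspaceWindow slices a
// keyspace, assuming the simulator's u64 key type (next() = +1, skip_some() = +100)
// and a single contiguous keyspace 0..100 with a target of 40 keys per image layer.
// The final call stretches up to 1.25x the target to absorb the small tail
// instead of emitting a tiny trailing image layer.
#[cfg(test)]
mod keyspace_window_sketch {
    use super::*;

    #[test]
    fn splits_keyspace_into_roughly_target_sized_images() {
        let mut window = KeyspaceWindow::new(0u64..100, vec![0..100], 40);
        assert_eq!(window.choose_next_image(), Some(0..40));
        assert_eq!(window.choose_next_image(), Some(40..80));
        // The remaining 20 keys fit within the stretch allowance.
        assert_eq!(window.choose_next_image(), Some(80..100));
        assert_eq!(window.choose_next_image(), None);
    }
}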
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
//
// Candidates:
//
// 1. Create an image layer, snapping to previous images
// 2. Create a delta layer, snapping to previous images
// 3. Create an image layer, snapping to
//
//
// Take previous partitioning, based on the image layers below.
//
// Candidate is at the front:
//
// Consider stretching an image layer to next divider? If it's close enough,
// that's the image candidate
//
// If it's too far, consider splitting at a reasonable point
//
// Is the image candidate smaller than the equivalent delta? If so,
// split off the image. Otherwise, split off one delta.
// Try to snap off the delta at a reasonable point
struct WindowElement<K> {
start_key: K, // inclusive
last_key: K, // inclusive
accum_size: u64,
}
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
// last key that was split off, inclusive
splitoff_key: Option<K>,
splitoff_size: u64,
}
impl<K> Window<K>
where
K: CompactionKey,
{
fn new() -> Self {
Self {
elems: VecDeque::new(),
splitoff_key: None,
splitoff_size: 0,
}
}
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
}
// This is a new key.
let elem = WindowElement {
start_key: key,
last_key: key,
accum_size: last_size + size,
};
self.elems.push_back(elem);
}
fn remain_size(&self) -> u64 {
self.elems.back().unwrap().accum_size - self.splitoff_size
}
fn peek_size(&self) -> u64 {
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();
self.elems.front_mut().unwrap().start_key = popped.start_key;
upto -= 1;
}
}
fn find_size_split(&self, target_size: u64) -> usize {
self.elems
.partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
}
fn pop(&mut self) {
let first = self.elems.pop_front().unwrap();
self.splitoff_size = first.accum_size;
self.splitoff_key = Some(first.last_key);
}
// the difference between delta and image is that an image covers
// any unused keyspace before and after, while a delta tries to
// minimize that. TODO: difference not implemented
fn pop_delta(&mut self) -> Range<K> {
let first = self.elems.front().unwrap();
let key_range = first.start_key..first.last_key.next();
self.pop();
key_range
}
// Prerequisite: we have enough input in the window
//
    // If this returns None, the caller should feed more data and call again.
fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
if has_more && self.elems.is_empty() {
// Starting up
return None;
}
// If we still have an undersized candidate, just keep going
while self.peek_size() < target_size {
if self.elems.len() > 1 {
self.commit_upto(2);
} else if has_more {
return None;
} else {
break;
}
}
// Ensure we have enough input in the window to make a good decision
if has_more && self.remain_size() < target_size * 5 / 4 {
return None;
}
        // The candidate at the front is now large enough for a delta, and we
        // have enough data in the window to decide.
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);
self.commit_upto(delta_split_at);
// If it's still not large enough, request the caller to fill the window
if self.elems.len() == 1 && has_more {
return None;
}
}
Some(self.pop_delta())
}
}
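// A minimal sketch (not in the original source) of the Window protocol used by
// retile_deltas above: feed() per-key sizes in key order and ask for the next
// delta split; when choose_next_delta() returns None the caller must feed more.
// Assumes the simulator's u64 key type. With a 150-byte target, three 100-byte
// keys split into one ~200-byte delta covering keys 1 and 2, and a tail delta for key 3.
#[cfg(test)]
mod window_sketch {
    use super::*;

    #[test]
    fn splits_accumulated_keys_at_roughly_target_size() {
        let mut window: Window<u64> = Window::new();
        window.feed(1, 100);
        window.feed(2, 100);
        window.feed(3, 100);
        // `has_more = false`: all input has been fed, so the window may commit.
        assert_eq!(window.choose_next_delta(150, false), Some(1..3));
        assert_eq!(window.choose_next_delta(150, false), Some(3..4));
        assert!(window.elems.is_empty());
    }
}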

View File

@@ -1,251 +0,0 @@
//! This file contains generic utility functions over the interface types,
//! which could be handy for any compaction implementation.
use crate::interface::*;
use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::Poll;
pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
K: CompactionKey,
{
let mut total = 0;
for r in keyspace.iter() {
total += K::key_range_size(r) as u64;
}
total
}
pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
.into_iter()
.kmerge_by(|a, b| a.start < b.start);
let mut ranges = Vec::new();
if let Some(first) = all_ranges_iter.next() {
let (mut start, mut end) = (first.start, first.end);
for r in all_ranges_iter {
assert!(r.start >= start);
if r.start > end {
ranges.push(start..end);
start = r.start;
end = r.end;
} else if r.end > end {
end = r.end;
}
}
ranges.push(start..end);
}
*a = ranges
}
pub fn intersect_keyspace<K: Ord + Clone + Copy>(
a: &CompactionKeySpace<K>,
r: &Range<K>,
) -> CompactionKeySpace<K> {
let mut ranges: Vec<Range<K>> = Vec::new();
for x in a.iter() {
if x.end <= r.start {
continue;
}
if x.start >= r.end {
break;
}
ranges.push(x.clone())
}
// trim the ends
if let Some(first) = ranges.first_mut() {
first.start = std::cmp::max(first.start, r.start);
}
if let Some(last) = ranges.last_mut() {
last.end = std::cmp::min(last.end, r.end);
}
ranges
}
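// A minimal sketch (not in the original source) of the keyspace helpers above,
// using plain u64 keys: union_to_keyspace merges overlapping ranges in place,
// and intersect_keyspace clips a keyspace to a bounding range.
#[cfg(test)]
mod keyspace_helper_sketch {
    use super::*;

    #[test]
    fn union_and_intersect() {
        assert!(overlaps_with(&(0u64..10), &(5..15)));
        assert!(!overlaps_with(&(0u64..10), &(10..20)));

        let mut ks = vec![0u64..5, 10..15];
        union_to_keyspace(&mut ks, vec![3..12]);
        assert_eq!(ks, vec![0..15]);

        assert_eq!(intersect_keyspace(&ks, &(5..10)), vec![5..10]);
    }
}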
/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
// Use a binary heap to merge the layers. Each input layer is initially
// represented by a LazyLoadLayer::Unloaded element, which uses the start of
// the layer's key range as the key. The first time a layer reaches the top
// of the heap, all the keys of the layer are loaded into a sorted vector.
//
// This helps to keep the memory usage reasonable: we only need to hold in
// memory the DeltaEntrys of the layers that overlap with the "current" key.
let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
for l in layers {
heap.push(LazyLoadLayer::Unloaded(l));
}
MergeDeltaKeys {
heap,
ctx,
load_future: None,
}
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
// reverse order so that we get a min-heap
other.key().partial_cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
// Stream returned by `merge_delta_keys`
pin_project! {
#[allow(clippy::type_complexity)]
pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
heap: BinaryHeap<LazyLoadLayer<'a, E>>,
#[pin]
load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
ctx: &'a E::RequestContext,
}
}
impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
E: CompactionJobExecutor + 'a,
{
type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
fn poll_next(
self: Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
let mut this = self.project();
loop {
if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
// We are waiting for loading the keys to finish
match load_future.as_mut().poll(cx) {
Poll::Ready(Ok(entries)) => {
this.load_future.set(None);
*this.heap.peek_mut().unwrap() =
LazyLoadLayer::Loaded(VecDeque::from(entries));
}
Poll::Ready(Err(e)) => {
return Poll::Ready(Some(Err(e)));
}
Poll::Pending => {
return Poll::Pending;
}
}
}
// If the topmost layer in the heap hasn't been loaded yet, start
// loading it. Otherwise return the next entry from it and update
// the layer's position in the heap (this decreaseKey operation is
// performed implicitly when `top` is dropped).
if let Some(mut top) = this.heap.peek_mut() {
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut)));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {
let result = entries.pop_front().unwrap();
if entries.is_empty() {
std::collections::binary_heap::PeekMut::pop(top);
}
return Poll::Ready(Some(Ok(result)));
}
}
} else {
return Poll::Ready(None);
}
}
}
}
// Accumulate values at key boundaries
pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
}
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
async_stream::try_stream! {
// Initialize the state from the first value
let mut input = std::pin::pin!(input);
if let Some(first) = input.next().await {
let first = first?;
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: first.size(),
};
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
accum.num_values += 1;
} else {
yield accum;
accum = KeySize {
key: this.key(),
num_values: 1,
size: this.size(),
};
}
}
yield accum;
}
}
}
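// A minimal sketch (not in the original source) of accum_key_values, using a toy
// CompactionDeltaEntry implementation: consecutive entries for the same key are
// collapsed into a single KeySize with the summed size and value count.
#[cfg(test)]
mod accum_sketch {
    use super::*;
    use crate::interface::CompactionDeltaEntry;
    use futures::TryStreamExt;
    use utils::lsn::Lsn;

    struct ToyEntry {
        key: u64,
        lsn: Lsn,
        size: u64,
    }

    impl CompactionDeltaEntry<'_, u64> for ToyEntry {
        fn key(&self) -> u64 {
            self.key
        }
        fn lsn(&self) -> Lsn {
            self.lsn
        }
        fn size(&self) -> u64 {
            self.size
        }
    }

    #[tokio::test]
    async fn collapses_per_key() -> anyhow::Result<()> {
        let entries = vec![
            ToyEntry { key: 1, lsn: Lsn(10), size: 100 },
            ToyEntry { key: 1, lsn: Lsn(20), size: 50 },
            ToyEntry { key: 2, lsn: Lsn(30), size: 10 },
        ];
        let input = futures::stream::iter(entries.into_iter().map(anyhow::Ok));
        let sizes: Vec<KeySize<u64>> = accum_key_values(input).try_collect().await?;
        assert_eq!(sizes.len(), 2);
        assert_eq!((sizes[0].key, sizes[0].num_values, sizes[0].size), (1, 2, 150));
        assert_eq!((sizes[1].key, sizes[1].num_values, sizes[1].size), (2, 1, 10));
        Ok(())
    }
}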

View File

@@ -1,376 +0,0 @@
//! An LSM tree consists of multiple levels, each exponentially larger than the
//! previous level. Each level in turn consists of multiple "tiers". With tiered
//! compaction, a level is compacted when it has accumulated more than N tiers,
//! forming one tier on the next level.
//!
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
//! we identify them by looking at the shapes of the layers. It's an easy task
//! for a human, but it's not straightforward to come up with the exact
//! rules, especially in cases like interrupted, half-finished
//! compactions, or highly skewed data distributions that have let us "skip"
//! some levels. It's not critical to classify all cases correctly; at worst we
//! delay some compaction work, and suffer from more read amplification, or we
//! perform some unnecessary compaction work.
//!
//! `identify_level` performs that shape-matching.
//!
//! It returns a Level struct, which has a `depth()` function to count the number
//! of "tiers" in the level. The tier count is the max depth of stacked layers
//! within the level. That's a good measure, because the point of compacting is
//! to reduce read amplification, and the depth is what determines that.
//!
//! One interesting effect of this is that if we generate very small delta
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
//! because they reach the target size, the L0 compaction will combine them to
//! one larger file. But if the combined file is still smaller than the target
//! file size, the file will still be considered to be part of L0 at the next
//! iteration.
use anyhow::bail;
use std::collections::BTreeSet;
use std::ops::Range;
use utils::lsn::Lsn;
use crate::interface::*;
use tracing::{info, trace};
pub struct Level<L> {
pub lsn_range: Range<Lsn>,
pub layers: Vec<L>,
}
/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
/// no layers that cross the boundary LSN.
///
/// A further restriction is that all layers in the returned partition cover at
/// most 'lsn_max_size' LSN bytes.
pub async fn identify_level<K, L>(
all_layers: Vec<L>,
end_lsn: Lsn,
lsn_max_size: u64,
) -> anyhow::Result<Option<Level<L>>>
where
K: CompactionKey,
L: CompactionLayer<K> + Clone,
{
    // filter out layers that are above `end_lsn`; they are completely irrelevant.
let mut layers = Vec::new();
for l in all_layers {
if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
// shouldn't happen. Indicates that the caller passed a bogus
// end_lsn.
bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
}
        // include image layers sitting exactly at `end_lsn`.
let is_image = !l.is_delta();
if (is_image && l.lsn_range().start > end_lsn)
|| (!is_image && l.lsn_range().start >= end_lsn)
{
continue;
}
layers.push(l);
}
// All the remaining layers either belong to this level, or are below it.
info!(
"identify level at {}, size {}, num layers below: {}",
end_lsn,
lsn_max_size,
layers.len()
);
if layers.is_empty() {
return Ok(None);
}
// Walk the ranges in LSN order.
//
// ----- end_lsn
// |
// |
// v
//
layers.sort_by_key(|l| l.lsn_range().end);
let mut candidate_start_lsn = end_lsn;
let mut candidate_layers: Vec<L> = Vec::new();
let mut current_best_start_lsn = end_lsn;
let mut current_best_layers: Vec<L> = Vec::new();
let mut iter = layers.into_iter();
loop {
let Some(l) = iter.next_back() else {
// Reached end. Accept the last candidate
current_best_start_lsn = candidate_start_lsn;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
break;
};
trace!(
"inspecting {} for candidate {}, current best {}",
l.short_id(),
candidate_start_lsn,
current_best_start_lsn
);
let r = l.lsn_range();
// Image layers don't restrict our choice of cutoff LSN
if l.is_delta() {
// Is this candidate workable? In other words, are there any
// delta layers that span across this LSN
//
// Valid: Not valid:
// + +
// | | +
// + <- candidate + | <- candidate
// + +
// |
// +
if r.end <= candidate_start_lsn {
// Hooray, there are no crossing LSNs. And we have visited
// through all the layers within candidate..end_lsn. The
// current candidate can be accepted.
current_best_start_lsn = r.end;
current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
candidate_start_lsn = r.start;
}
// Is it small enough to be considered part of this level?
if r.end.0 - r.start.0 > lsn_max_size {
// Too large, this layer belongs to next level. Stop.
trace!(
"too large {}, size {} vs {}",
l.short_id(),
r.end.0 - r.start.0,
lsn_max_size
);
break;
}
// If this crosses the candidate lsn, push it down.
if r.start < candidate_start_lsn {
trace!(
"layer {} prevents from stopping at {}",
l.short_id(),
candidate_start_lsn
);
candidate_start_lsn = r.start;
}
}
// Include this layer in our candidate
candidate_layers.push(l);
}
Ok(if current_best_start_lsn == end_lsn {
// empty level
None
} else {
Some(Level {
lsn_range: current_best_start_lsn..end_lsn,
layers: current_best_layers,
})
})
}
// helper struct used in depth()
struct Event<K> {
key: K,
layer_idx: usize,
start: bool,
}
impl<L> Level<L> {
/// Count the number of deltas stacked on each other.
pub fn depth<K>(&self) -> u64
where
K: CompactionKey,
L: CompactionLayer<K>,
{
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
events.push(Event {
key: l.key_range().start,
layer_idx: idx,
start: true,
});
events.push(Event {
key: l.key_range().end,
layer_idx: idx,
start: false,
});
}
events.sort_by_key(|e| (e.key, e.start));
// Sweep the key space left to right. Stop at each distinct key, and
// count the number of deltas on top of the highest image at that key.
//
        // This is a little inefficient, as we walk through the active_set on
// every key. We could increment/decrement a counter on each step
// instead, but that'd require a bit more complex bookkeeping.
let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
let mut max_depth = 0;
let mut events_iter = events.iter().peekable();
while let Some(e) = events_iter.next() {
let l = &self.layers[e.layer_idx];
let is_image = !l.is_delta();
// update the active set
if e.start {
active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
} else {
active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
}
// recalculate depth if this was the last event at this point
let more_events_at_this_key = events_iter
.peek()
.map_or(false, |next_e| next_e.key == e.key);
if !more_events_at_this_key {
let mut active_depth = 0;
for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
if *is_image {
break;
}
active_depth += 1;
}
if active_depth > max_depth {
max_depth = active_depth;
}
}
}
max_depth
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
use std::sync::{Arc, Mutex};
fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
MockLayer::Delta(Arc::new(MockDeltaLayer {
key_range,
lsn_range,
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
records: vec![],
}))
}
fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
MockLayer::Image(Arc::new(MockImageLayer {
key_range,
lsn_range: lsn..(lsn + 1),
// identify_level() doesn't pay attention to the rest of the fields
file_size: 0,
deleted: Mutex::new(false),
}))
}
#[tokio::test]
async fn test_identify_level() -> anyhow::Result<()> {
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
];
// All layers fit in the max file size
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.depth(), 6);
// Same LSN with smaller max file size. The second layer from the top is larger
// and belongs to next level.
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
// Call with a smaller LSN
let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 2);
// Call with an LSN that doesn't partition the space
let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
assert!(result.is_err());
Ok(())
}
#[tokio::test]
async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
// The files LSN ranges overlap, so even though there are more files that
// fit under the file size, they are not included in the level because they
// overlap so that we'd need to include the oldest file, too, which is
// larger
let layers = vec![
delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
.await?
.unwrap();
assert_eq!(level.depth(), 1);
Ok(())
}
#[tokio::test]
async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
// The key ranges don't overlap, so depth is only 1.
let layers = vec![
delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 1);
// Staggered. The 1st and 3rd layer don't overlap with each other.
let layers = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 3);
assert_eq!(level.depth(), 2);
Ok(())
}
#[tokio::test]
async fn test_depth_images() -> anyhow::Result<()> {
let layers: Vec<MockLayer> = vec![
delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
// This covers the same key range as the 2nd delta layer. The depth
// in that key range is therefore 0.
image(1500..2500, Lsn(0x9000)),
];
let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
.await?
.unwrap();
assert_eq!(level.layers.len(), 4);
assert_eq!(level.depth(), 1);
Ok(())
}
}

View File

@@ -1,152 +0,0 @@
//! This is what the compaction implementation needs to know about
//! layers, keyspace etc.
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
//!
use async_trait::async_trait;
use std::ops::Range;
use utils::lsn::Lsn;
/// Public interface. This is the main thing that the implementor needs to provide
#[async_trait]
pub trait CompactionJobExecutor {
// Type system.
//
// We assume that there are two kinds of layers, deltas and images. The
// compaction doesn't distinguish whether they are stored locally or
// remotely.
//
// The keyspace is defined by CompactionKey trait.
//
type Key: CompactionKey;
type Layer: CompactionLayer<Self::Key> + Clone;
type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
type ImageLayer: CompactionImageLayer<Self> + Clone;
// This is passed through to all the interface functions. The compaction
// implementation doesn't do anything with it, but it might be useful for
// the interface implementation.
type RequestContext: CompactionRequestContext;
// ----
// Functions that the planner uses to support its decisions
// ----
/// Return all layers that overlap the given bounding box.
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>>;
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
lsn: Lsn,
ctx: &Self::RequestContext,
) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
/// NB: This is a pretty expensive operation. In the real pageserver
/// implementation, it downloads the layer, and keeps it resident
/// until the DeltaLayer is dropped.
async fn downcast_delta_layer(
&self,
layer: &Self::Layer,
) -> anyhow::Result<Option<Self::DeltaLayer>>;
// ----
// Functions to execute the plan
// ----
/// Create a new image layer, materializing all the values in the key range,
/// at given 'lsn'.
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Self::Key>,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Create a new delta layer, containing all the values from 'input_layers'
/// in the given key and LSN range.
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Self::Key>,
input_layers: &[Self::DeltaLayer],
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
/// Delete a layer. The compaction implementation will call this only after
    /// all the create_image() or create_delta() calls that the deletion of this
    /// layer depends on have finished. But if the implementor has extra lazy
    /// background tasks, like uploading the index json file to remote storage,
    /// it is the implementation's responsibility to track those.
async fn delete_layer(
&mut self,
layer: &Self::Layer,
ctx: &Self::RequestContext,
) -> anyhow::Result<()>;
}
pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
const MIN: Self;
const MAX: Self;
/// Calculate distance between key_range.start and key_range.end.
///
/// This returns u32, for compatibility with Repository::key. If the
/// distance is larger, return u32::MAX.
fn key_range_size(key_range: &Range<Self>) -> u32;
// return "self + 1"
fn next(&self) -> Self;
// return "self + <some decent amount to skip>". The amount to skip
// is left to the implementation.
// FIXME: why not just "add(u32)" ? This is hard to use
fn skip_some(&self) -> Self;
}
/// Contiguous ranges of keys that belong to the key space. In key order, and
/// with no overlap.
pub type CompactionKeySpace<K> = Vec<Range<K>>;
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;
fn file_size(&self) -> u64;
/// For debugging, short human-readable representation of the layer. E.g. filename.
fn short_id(&self) -> String;
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
pub trait CompactionDeltaEntry<'a, K> {
fn key(&self) -> K;
fn lsn(&self) -> Lsn;
fn size(&self) -> u64;
}
pub trait CompactionRequestContext {}

View File

@@ -1,12 +0,0 @@
// The main module implementing the compaction algorithm
pub mod compact_tiered;
pub(crate) mod identify_levels;
// Traits that the caller of the compaction needs to implement
pub mod interface;
// Utility functions, useful for the implementation
pub mod helpers;
// A simulator with mock implementations of 'interface'
pub mod simulator;
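// A minimal sketch (not in the original source) of how a caller drives this crate,
// mirroring MockTimeline::compact() in the simulator module: implement
// CompactionJobExecutor for your timeline type and run one compaction round.
// `my_timeline`, `end_lsn`, `target_file_size`, `fanout` and `ctx` are placeholders.
//
//     compact_tiered::compact_tiered(
//         &mut my_timeline,  // your CompactionJobExecutor implementation
//         end_lsn,           // compact the LSN range below this point
//         target_file_size,  // desired layer file size, in bytes
//         fanout,            // how many tiers a level accumulates before it is compacted
//         &ctx,              // your CompactionRequestContext
//     )
//     .await?;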

View File

@@ -1,613 +0,0 @@
mod draw;
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
use async_trait::async_trait;
use futures::StreamExt;
use rand::Rng;
use tracing::info;
use utils::lsn::Lsn;
use std::fmt::Write;
use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
use crate::interface::CompactionLayer;
//
// Implementation for the CompactionExecutor interface
//
pub struct MockTimeline {
// Parameters for the compaction algorithm
pub target_file_size: u64,
tiers_per_level: u64,
num_l0_flushes: u64,
last_compact_at_flush: u64,
last_flush_lsn: Lsn,
// In-memory layer
records: Vec<MockRecord>,
total_len: u64,
start_lsn: Lsn,
end_lsn: Lsn,
// Current keyspace at `end_lsn`. This is updated on every ingested record.
keyspace: KeySpace,
// historic keyspaces
old_keyspaces: Vec<(Lsn, KeySpace)>,
// "on-disk" layers
pub live_layers: Vec<MockLayer>,
num_deleted_layers: u64,
// Statistics
wal_ingested: u64,
bytes_written: u64,
bytes_deleted: u64,
layers_created: u64,
layers_deleted: u64,
// All the events - creation and deletion of files - are collected
// in 'history'. It is used to draw the SVG animation at the end.
time: u64,
history: Vec<draw::LayerTraceEvent>,
}
type KeySpace = interface::CompactionKeySpace<Key>;
pub struct MockRequestContext {}
impl interface::CompactionRequestContext for MockRequestContext {}
pub type Key = u64;
impl interface::CompactionKey for Key {
const MIN: Self = u64::MIN;
const MAX: Self = u64::MAX;
fn key_range_size(key_range: &Range<Self>) -> u32 {
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
}
fn next(&self) -> Self {
self + 1
}
fn skip_some(&self) -> Self {
// round up to next xx
self + 100
}
}
#[derive(Clone)]
pub struct MockRecord {
lsn: Lsn,
key: Key,
len: u64,
}
impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
fn key(&self) -> Key {
self.key
}
fn lsn(&self) -> Lsn {
self.lsn
}
fn size(&self) -> u64 {
self.len
}
}
pub struct MockDeltaLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
pub records: Vec<MockRecord>,
}
impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}-{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
)
}
fn is_delta(&self) -> bool {
true
}
}
#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
type DeltaEntry<'a> = MockRecord;
async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
Ok(self.records.clone())
}
}
pub struct MockImageLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
pub deleted: Mutex<bool>,
}
impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> String {
format!(
"{:016X}-{:016X}__{:08X}",
self.key_range.start, self.key_range.end, self.lsn_range.start.0,
)
}
fn is_delta(&self) -> bool {
false
}
}
impl MockTimeline {
pub fn new() -> Self {
MockTimeline {
target_file_size: 256 * 1024 * 1024,
tiers_per_level: 4,
num_l0_flushes: 0,
last_compact_at_flush: 0,
last_flush_lsn: Lsn(0),
records: Vec::new(),
total_len: 0,
start_lsn: Lsn(1000),
end_lsn: Lsn(1000),
keyspace: KeySpace::new(),
old_keyspaces: vec![],
live_layers: vec![],
num_deleted_layers: 0,
wal_ingested: 0,
bytes_written: 0,
bytes_deleted: 0,
layers_created: 0,
layers_deleted: 0,
time: 0,
history: Vec::new(),
}
}
pub async fn compact(&mut self) -> anyhow::Result<()> {
let ctx = MockRequestContext {};
crate::compact_tiered::compact_tiered(
self,
self.last_flush_lsn,
self.target_file_size,
self.tiers_per_level,
&ctx,
)
.await?;
Ok(())
}
// Ingest one record to the timeline
pub fn ingest_record(&mut self, key: Key, len: u64) {
self.records.push(MockRecord {
lsn: self.end_lsn,
key,
len,
});
self.total_len += len;
self.end_lsn += len;
if self.total_len > self.target_file_size {
self.flush_l0();
}
}
pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
self.compact().await?;
self.last_compact_at_flush = self.num_l0_flushes;
}
Ok(())
}
pub fn flush_l0(&mut self) {
if self.records.is_empty() {
return;
}
let mut records = std::mem::take(&mut self.records);
records.sort_by_key(|rec| rec.key);
let lsn_range = self.start_lsn..self.end_lsn;
let new_layer = Arc::new(MockDeltaLayer {
key_range: Key::MIN..Key::MAX,
lsn_range: lsn_range.clone(),
file_size: self.total_len,
records,
deleted: Mutex::new(false),
});
info!("flushed L0 layer {}", new_layer.short_id());
self.live_layers.push(MockLayer::from(&new_layer));
// reset L0
self.start_lsn = self.end_lsn;
self.total_len = 0;
self.records = Vec::new();
self.layers_created += 1;
self.bytes_written += new_layer.file_size;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Flush,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
self.num_l0_flushes += 1;
self.last_flush_lsn = self.end_lsn;
}
    // Ingest `num_records` records to the timeline, with random keys
// uniformly distributed in `key_range`
pub fn ingest_uniform(
&mut self,
num_records: u64,
len: u64,
key_range: &Range<Key>,
) -> anyhow::Result<()> {
crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
let mut rng = rand::thread_rng();
for _ in 0..num_records {
self.ingest_record(rng.gen_range(key_range.clone()), len);
self.wal_ingested += len;
}
Ok(())
}
pub fn print_stats(&self) -> anyhow::Result<String> {
let mut s = String::new();
writeln!(s, "STATISTICS:")?;
writeln!(
s,
"WAL ingested: {:>10} MB",
self.wal_ingested / (1024 * 1024)
)?;
writeln!(
s,
"size created: {:>10} MB",
self.bytes_written / (1024 * 1024)
)?;
writeln!(
s,
"size deleted: {:>10} MB",
self.bytes_deleted / (1024 * 1024)
)?;
writeln!(s, "files created: {:>10}", self.layers_created)?;
writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
writeln!(
s,
"write amp: {:>10.2}",
self.bytes_written as f64 / self.wal_ingested as f64
)?;
writeln!(
s,
"storage amp: {:>10.2}",
(self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
)?;
Ok(s)
}
pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
draw::draw_history(&self.history, output)
}
}
impl Default for MockTimeline {
fn default() -> Self {
Self::new()
}
}
#[derive(Clone)]
pub enum MockLayer {
Delta(Arc<MockDeltaLayer>),
Image(Arc<MockImageLayer>),
}
impl interface::CompactionLayer<Key> for MockLayer {
fn key_range(&self) -> &Range<Key> {
match self {
MockLayer::Delta(this) => this.key_range(),
MockLayer::Image(this) => this.key_range(),
}
}
fn lsn_range(&self) -> &Range<Lsn> {
match self {
MockLayer::Delta(this) => this.lsn_range(),
MockLayer::Image(this) => this.lsn_range(),
}
}
fn file_size(&self) -> u64 {
match self {
MockLayer::Delta(this) => this.file_size(),
MockLayer::Image(this) => this.file_size(),
}
}
fn short_id(&self) -> String {
match self {
MockLayer::Delta(this) => this.short_id(),
MockLayer::Image(this) => this.short_id(),
}
}
fn is_delta(&self) -> bool {
match self {
MockLayer::Delta(_) => true,
MockLayer::Image(_) => false,
}
}
}
impl MockLayer {
fn is_deleted(&self) -> bool {
let guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
*guard
}
fn mark_deleted(&self) {
let mut deleted_guard = match self {
MockLayer::Delta(this) => this.deleted.lock().unwrap(),
MockLayer::Image(this) => this.deleted.lock().unwrap(),
};
assert!(!*deleted_guard, "layer already deleted");
*deleted_guard = true;
}
}
impl From<&Arc<MockDeltaLayer>> for MockLayer {
fn from(l: &Arc<MockDeltaLayer>) -> Self {
MockLayer::Delta(l.clone())
}
}
impl From<&Arc<MockImageLayer>> for MockLayer {
fn from(l: &Arc<MockImageLayer>) -> Self {
MockLayer::Image(l.clone())
}
}
#[async_trait]
impl interface::CompactionJobExecutor for MockTimeline {
type Key = Key;
type Layer = MockLayer;
type DeltaLayer = Arc<MockDeltaLayer>;
type ImageLayer = Arc<MockImageLayer>;
type RequestContext = MockRequestContext;
async fn get_layers(
&mut self,
key_range: &Range<Self::Key>,
lsn_range: &Range<Lsn>,
_ctx: &Self::RequestContext,
) -> anyhow::Result<Vec<Self::Layer>> {
// Clear any deleted layers from our vec
self.live_layers.retain(|l| !l.is_deleted());
let layers: Vec<MockLayer> = self
.live_layers
.iter()
.filter(|l| {
overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
})
.cloned()
.collect();
Ok(layers)
}
async fn get_keyspace(
&mut self,
key_range: &Range<Self::Key>,
_lsn: Lsn,
_ctx: &Self::RequestContext,
) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
// find it in the levels
if self.old_keyspaces.is_empty() {
Ok(crate::helpers::intersect_keyspace(
&self.keyspace,
key_range,
))
} else {
// not implemented
// The mock implementation only allows requesting the
// keyspace at the level's end LSN. That's all that the
// current implementation needs.
panic!("keyspace not available for requested lsn");
}
}
async fn downcast_delta_layer(
&self,
layer: &MockLayer,
) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
Ok(match layer {
MockLayer::Delta(l) => Some(l.clone()),
MockLayer::Image(_) => None,
})
}
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
let mut accum_size: u64 = 0;
for r in keyspace {
accum_size += r.end - r.start;
}
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(
"created image layer, size {}: {}",
new_layer.file_size,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Image(new_layer.clone()));
// update stats
self.bytes_written += new_layer.file_size;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateImage,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[Arc<MockDeltaLayer>],
ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let mut key_value_stream =
std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
let mut records: Vec<MockRecord> = Vec::new();
let mut total_len = 2;
while let Some(delta_entry) = key_value_stream.next().await {
let delta_entry: MockRecord = delta_entry?;
if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
total_len += delta_entry.len;
records.push(delta_entry);
}
}
let total_records = records.len();
let new_layer = Arc::new(MockDeltaLayer {
key_range: key_range.clone(),
lsn_range: lsn_range.clone(),
file_size: total_len,
records,
deleted: Mutex::new(false),
});
info!(
"created delta layer, recs {}, size {}: {}",
total_records,
total_len,
new_layer.short_id()
);
self.live_layers.push(MockLayer::Delta(new_layer.clone()));
// update stats
self.bytes_written += total_len;
self.layers_created += 1;
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::CreateDelta,
file: LayerTraceFile {
filename: new_layer.short_id(),
key_range: new_layer.key_range.clone(),
lsn_range: new_layer.lsn_range.clone(),
},
});
Ok(())
}
async fn delete_layer(
&mut self,
layer: &Self::Layer,
_ctx: &MockRequestContext,
) -> anyhow::Result<()> {
let layer = std::pin::pin!(layer);
info!("deleting layer: {}", layer.short_id());
self.num_deleted_layers += 1;
self.bytes_deleted += layer.file_size();
layer.mark_deleted();
self.time += 1;
self.history.push(LayerTraceEvent {
time_rel: self.time,
op: LayerTraceOp::Delete,
file: LayerTraceFile {
filename: layer.short_id(),
key_range: layer.key_range().clone(),
lsn_range: layer.lsn_range().clone(),
},
});
Ok(())
}
}

View File

@@ -1,411 +0,0 @@
use super::Key;
use anyhow::Result;
use std::cmp::Ordering;
use std::{
collections::{BTreeMap, BTreeSet, HashSet},
fmt::Write,
ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::lsn::Lsn;
// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
map: BTreeMap<T, usize>,
stretch: f32,
}
impl<T: Ord + Copy> CoordinateMap<T> {
fn new(coords: Vec<T>, stretch: f32) -> Self {
let set: BTreeSet<T> = coords.into_iter().collect();
let mut map: BTreeMap<T, usize> = BTreeMap::new();
for (i, e) in set.iter().enumerate() {
map.insert(*e, i);
}
Self { map, stretch }
}
// This assumes that the map contains an exact point for this.
    // Use _map_inexact for values in between.
fn map(&self, val: T) -> f32 {
*self.map.get(&val).unwrap() as f32 * self.stretch
}
// the value is still assumed to be within the min/max bounds
// (this is currently unused)
fn _map_inexact(&self, val: T) -> f32 {
let prev = *self.map.range(..=val).next().unwrap().1;
let next = *self.map.range(val..).next().unwrap().1;
// interpolate
(prev as f32 + (next - prev) as f32) * self.stretch
}
fn max(&self) -> f32 {
self.map.len() as f32 * self.stretch
}
}
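// A minimal sketch (not in the original source) of CoordinateMap: values are
// compressed to their index in the sorted, deduplicated input, then scaled by
// the stretch factor.
#[cfg(test)]
mod coordinate_map_sketch {
    use super::CoordinateMap;

    #[test]
    fn compresses_coordinates() {
        let map = CoordinateMap::new(vec![10u64, 5, 10, 20], 2.0);
        assert_eq!(map.map(5), 0.0);
        assert_eq!(map.map(10), 2.0);
        assert_eq!(map.map(20), 4.0);
        assert_eq!(map.max(), 6.0);
    }
}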
#[derive(PartialEq, Hash, Eq)]
pub enum LayerTraceOp {
Flush,
CreateDelta,
CreateImage,
Delete,
}
impl std::fmt::Display for LayerTraceOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
let op_str = match self {
LayerTraceOp::Flush => "flush",
LayerTraceOp::CreateDelta => "create_delta",
LayerTraceOp::CreateImage => "create_image",
LayerTraceOp::Delete => "delete",
};
f.write_str(op_str)
}
}
#[derive(PartialEq, Hash, Eq, Clone)]
pub struct LayerTraceFile {
pub filename: String,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl LayerTraceFile {
fn is_image(&self) -> bool {
self.lsn_range.end == self.lsn_range.start
}
}
pub struct LayerTraceEvent {
pub time_rel: u64,
pub op: LayerTraceOp,
pub file: LayerTraceFile,
}
pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
let mut files: Vec<LayerTraceFile> = Vec::new();
for event in history {
files.push(event.file.clone());
}
let last_time_rel = history.last().unwrap().time_rel;
// Collect all coordinates
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for f in files.iter() {
keys.push(f.key_range.start);
keys.push(f.key_range.end);
lsns.push(f.lsn_range.start);
lsns.push(f.lsn_range.end);
}
// Analyze
let key_map = CoordinateMap::new(keys, 2.0);
// Stretch out vertically for better visibility
let lsn_map = CoordinateMap::new(lsns, 3.0);
let mut svg = String::new();
// Draw
writeln!(
svg,
"{}",
BeginSvg {
w: key_map.max(),
h: lsn_map.max(),
}
)?;
let lsn_max = lsn_map.max();
    // Sort the files by LSN, with image layers after all the delta layers.
// The SVG is painted in the order the elements appear, and we want to draw
// image layers on top of the delta layers if they overlap
//
// (This could also be implemented via z coordinates: image layers get one z
// coord, delta layers get another z coord.)
let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
files_sorted.sort_by(|a, b| {
if a.is_image() && !b.is_image() {
Ordering::Greater
} else if !a.is_image() && b.is_image() {
Ordering::Less
} else {
a.lsn_range.end.cmp(&b.lsn_range.end)
}
});
writeln!(svg, "<!-- layers -->")?;
let mut files_seen = HashSet::new();
for f in files_sorted {
if files_seen.contains(&f) {
continue;
}
let key_start = key_map.map(f.key_range.start);
let key_end = key_map.map(f.key_range.end);
let key_diff = key_end - key_start;
if key_start >= key_end {
panic!("Invalid key range {}-{}", key_start, key_end);
}
let lsn_start = lsn_map.map(f.lsn_range.start);
let lsn_end = lsn_map.map(f.lsn_range.end);
// Fill in and thicken rectangle if it's an
// image layer so that we can see it.
let mut style = Style::default();
style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
let y_start = lsn_max - lsn_start;
let y_end = lsn_max - lsn_end;
let x_margin = 0.25;
let y_margin = 0.5;
match f.lsn_range.start.cmp(&f.lsn_range.end) {
Ordering::Less => {
write!(
svg,
r#" <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end + y_margin,
key_diff - x_margin * 2.0,
y_start - y_end - y_margin * 2.0,
1.0, // border_radius,
style,
)?;
write!(svg, "<title>{}</title>", f.filename)?;
writeln!(svg, "</rect>")?;
}
Ordering::Equal => {
//lsn_diff = 0.3;
//lsn_offset = -lsn_diff / 2.0;
//margin = 0.05;
style.fill = Fill::Color(rgb(0x80, 0, 0x80));
style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
write!(
svg,
r#" <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
f.filename,
key_start + x_margin,
y_end,
key_end - x_margin,
y_end,
style,
)?;
write!(
svg,
"<title>{}<br>{} - {}</title>",
f.filename, lsn_end, y_end
)?;
writeln!(svg, "</line>")?;
}
Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
}
files_seen.insert(f);
}
let mut record_style = Style::default();
record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
record_style.stroke = Stroke::None;
writeln!(svg, "{}", EndSvg)?;
let mut layer_events_str = String::new();
let mut first = true;
for e in history {
if !first {
writeln!(layer_events_str, ",")?;
}
write!(
layer_events_str,
r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
e.time_rel, e.file.filename, e.op
)?;
first = false;
}
writeln!(layer_events_str)?;
writeln!(
output,
r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
display: block;
overflow: hidden;
background-color: lightgrey;
position: fixed;
top: 0;
width: 100%;
/* width: 500px; */
}}
.slidercontainer {{
float: left;
width: 50%;
margin-right: 200px;
}}
.slider {{
float: left;
width: 100%;
}}
.legend {{
width: 200px;
float: right;
}}
/* Main content */
.main {{
margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>
<body onload="init()">
<script type="text/javascript">
var layer_events = [{layer_events_str}]
let ticker;
function init() {{
for (let i = 0; i < layer_events.length; i++) {{
var layer = document.getElementById("layer_" + layer_events[i].filename);
layer.style.visibility = "hidden";
}}
last_layer_event = -1;
moveSlider(last_slider_pos)
}}
function startAnimation() {{
ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
clearInterval(ticker);
}}
function animateStep() {{
if (last_layer_event < layer_events.length - 1) {{
var slider = document.getElementById("time-slider");
let prevPos = slider.value
let nextEvent = last_layer_event + 1
while (nextEvent <= layer_events.length - 1) {{
if (layer_events[nextEvent].time_rel > prevPos) {{
break;
}}
nextEvent += 1;
}}
let nextPos = layer_events[nextEvent].time_rel
slider.value = nextPos
moveSlider(nextPos)
}}
}}
function redoLayerEvent(n, dir) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "visible";
break;
case "create_delta":
layer.style.visibility = "visible";
break;
case "create_image":
layer.style.visibility = "visible";
break;
case "delete":
layer.style.visibility = "hidden";
break;
}}
}}
function undoLayerEvent(n) {{
var layer = document.getElementById("layer_" + layer_events[n].filename);
switch (layer_events[n].op) {{
case "flush":
layer.style.visibility = "hidden";
break;
case "create_delta":
layer.style.visibility = "hidden";
break;
case "create_image":
layer.style.visibility = "hidden";
break;
case "delete":
layer.style.visibility = "visible";
break;
}}
}}
var last_slider_pos = 0
var last_layer_event = 0
var moveSlider = function(new_pos) {{
if (new_pos > last_slider_pos) {{
while (last_layer_event < layer_events.length - 1) {{
if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
break;
}}
last_layer_event += 1;
redoLayerEvent(last_layer_event)
}}
}}
if (new_pos < last_slider_pos) {{
while (last_layer_event >= 0) {{
if (layer_events[last_layer_event].time_rel <= new_pos) {{
break;
}}
undoLayerEvent(last_layer_event)
last_layer_event -= 1;
}}
}}
last_slider_pos = new_pos;
document.getElementById("debug_pos").textContent=new_pos;
if (last_layer_event >= 0) {{
document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
}} else {{
document.getElementById("debug_layer_event").textContent="begin";
}}
}}
</script>
<div class="topbar">
<div class="slidercontainer">
<label for="time-slider">TIME</label>:
<input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
pos: <span id="debug_pos"></span><br>
event: <span id="debug_layer_event"></span><br>
gc: <span id="debug_gc_event"></span><br>
</div>
<button onclick="startAnimation()">Play</button>
<button onclick="stopAnimation()">Stop</button>
<svg class="legend">
<rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
<line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
<line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
</svg>
</div>
<div class="main">
{svg}
</div>
</body>
</html>
"#
)?;
Ok(())
}

View File

@@ -1,37 +0,0 @@
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still have too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() -> anyhow::Result<()> {
let mut executor = MockTimeline::new();
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000))?;
executor.ingest_uniform(10_000, 10, &(0..1))?;
executor.compact().await?;
}
// Check that all the layers are smaller than the target size (with some slop)
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// sanity check that none of the delta layers are stupidly small either
if l.is_delta() {
assert!(l.file_size() > executor.target_file_size / 2);
}
}
Ok(())
}
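/// A minimal usage sketch (not in the original test suite) of the simulator:
/// ingest uniformly distributed updates, compact whenever enough L0 layers have
/// accumulated, then print statistics and dump the layer-history animation.
/// The output file name is arbitrary.
#[ignore]
#[tokio::test]
async fn simulator_usage_sketch() -> anyhow::Result<()> {
    let mut executor = MockTimeline::new();
    executor.target_file_size = 1_000_000; // 1 MB

    for _ in 0..100 {
        // 1000 records of 100 bytes each, keys uniform in 0..10_000
        executor.ingest_uniform(1000, 100, &(0..10_000))?;
        executor.compact_if_needed().await?;
    }
    executor.flush_l0();

    println!("{}", executor.print_stats()?);

    let out = std::fs::File::create("compaction-history.html")?;
    executor.draw_history(out)?;
    Ok(())
}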

View File

@@ -880,13 +880,6 @@ impl PageServerConf {
);
}
if let Some(compaction_algorithm) = item.get("compaction_algorithm") {
t_conf.compaction_algorithm = Some(
deserialize_from_item("compaction_algorithm", compaction_algorithm)
.context("parse compaction_algorithm")?,
);
}
if let Some(gc_horizon) = item.get("gc_horizon") {
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
}

View File

@@ -16,7 +16,7 @@ use tracing::*;
use utils::id::NodeId;
mod metrics;
use crate::consumption_metrics::metrics::MetricsKey;
use metrics::MetricsKey;
mod disk_cache;
mod upload;

View File

@@ -57,10 +57,7 @@ impl ControlPlaneClient {
if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new();
headers.insert(
"Authorization",
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
);
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
client = client.default_headers(headers);
}

View File

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
@@ -272,9 +271,7 @@ impl DeletionHeader {
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
.await
.maybe_fatal_err("save deletion header")?;
Ok(())
.map_err(Into::into)
}
}
@@ -363,7 +360,6 @@ impl DeletionList {
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
.await
.maybe_fatal_err("save deletion list")
.map_err(Into::into)
}
}

View File

@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
debug!("Deletion header {header_path} not found, first start?");
Ok(None)
} else {
on_fatal_io_error(&e, "reading deletion header");
Err(anyhow::anyhow!(e))
}
}
}
@@ -218,9 +216,16 @@ impl ListWriter {
self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix();
let mut dir = tokio::fs::read_dir(&deletion_directory)
.await
.fatal_err("read deletion directory");
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
Ok(d) => d,
Err(e) => {
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
let temp_extension = format!(".{TEMP_SUFFIX}");
let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
while let Some(dentry) = dir.next_entry().await? {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
@@ -241,9 +246,11 @@ impl ListWriter {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path =
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
tokio::fs::remove_file(&absolute_path)
.await
.fatal_err("delete temp file");
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Non-fatal error: we will just leave the file behind but not
// try and load it.
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
}
continue;
}
@@ -283,9 +290,7 @@ impl ListWriter {
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path)
.await
.fatal_err("read deletion list");
let list_bytes = tokio::fs::read(&list_path).await?;
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,

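The `list_name_pattern` above fixes the on-disk naming of deletion lists: a 16-character sequence, a dash, a 2-character version, and a `.list` suffix. A minimal sketch of parsing such a name with the regex crate, using the equivalent `(?P<...>)` named-group syntax, an anchored pattern with the dot escaped, and a hypothetical file name:

    use regex::Regex;

    fn main() {
        let re =
            Regex::new(r"^(?P<sequence>[a-zA-Z0-9]{16})-(?P<version>[a-zA-Z0-9]{2})\.list$").unwrap();
        // Hypothetical name following the convention used by ListWriter above.
        let name = "0000000000000123-01.list";
        if let Some(caps) = re.captures(name) {
            println!("sequence={} version={}", &caps["sequence"], &caps["version"]);
        }
    }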
View File

@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
for list_path in list_paths {
debug!("Removing deletion list {list_path}");
tokio::fs::remove_file(&list_path)
.await
.fatal_err("remove deletion list");
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {list_path}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
}
}

View File

@@ -569,17 +569,7 @@ paths:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: |
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
description: Tenant download is already in progress
content:
application/json:
schema:

View File

@@ -172,21 +172,6 @@ impl Key {
}
}
impl pageserver_compaction::interface::CompactionKey for Key {
const MIN: Self = Self::MIN;
const MAX: Self = Self::MAX;
fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
key_range_size(r)
}
fn next(&self) -> Key {
(self as &Key).next()
}
fn skip_some(&self) -> Key {
self.add(128)
}
}
/// A 'value' stored for a one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Value {

View File

@@ -3443,7 +3443,6 @@ pub(crate) mod harness {
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
gc_horizon: Some(tenant_conf.gc_horizon),
gc_period: Some(tenant_conf.gc_period),
image_creation_threshold: Some(tenant_conf.image_creation_threshold),

View File

@@ -23,17 +23,12 @@ pub mod defaults {
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
// FIXME the below configs are only used by the legacy algorithm. The new algorithm
// has different parameters.
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
super::CompactionAlgorithm::Legacy;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -275,7 +270,6 @@ pub struct TenantConf {
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: CompactionAlgorithm,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is # of bytes of WAL.
@@ -339,10 +333,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub compaction_threshold: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub compaction_algorithm: Option<CompactionAlgorithm>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_horizon: Option<u64>,
@@ -397,13 +387,6 @@ pub struct TenantConfOpt {
pub gc_feedback: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
Legacy,
Tiered,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
@@ -446,9 +429,6 @@ impl TenantConfOpt {
compaction_threshold: self
.compaction_threshold
.unwrap_or(global_conf.compaction_threshold),
compaction_algorithm: self
.compaction_algorithm
.unwrap_or(global_conf.compaction_algorithm),
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
image_creation_threshold: self
@@ -488,7 +468,6 @@ impl Default for TenantConf {
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),
@@ -577,12 +556,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
if let Some(compaction_algorithm) = &request_data.compaction_algorithm {
tenant_conf.compaction_algorithm = Some(
serde::Deserialize::deserialize(compaction_algorithm)
.context("parse field `compaction_algorithm`")?,
);
}
if let Some(compaction_period) = &request_data.compaction_period {
tenant_conf.compaction_period = Some(

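The `CompactionAlgorithm` enum removed above is internally tagged with `#[serde(tag = "kind")]`, so its serialized form is an object carrying a `kind` field; in the tenant config TOML this presumably corresponds to something like `compaction_algorithm = { kind = "Tiered" }`. A standalone serde sketch of that representation, not this crate's code:

    use serde::Deserialize;

    #[derive(Debug, Deserialize, PartialEq)]
    #[serde(tag = "kind")]
    enum CompactionAlgorithm {
        Legacy,
        Tiered,
    }

    fn main() {
        // Internally tagged unit variants deserialize from `{"kind": "<Variant>"}`.
        let v: CompactionAlgorithm = serde_json::from_str(r#"{ "kind": "Tiered" }"#).unwrap();
        assert_eq!(v, CompactionAlgorithm::Tiered);
    }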
View File

@@ -1542,7 +1542,7 @@ pub fn remote_index_path(
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
Some(f) => f,
None => {

View File

@@ -155,7 +155,7 @@ pub struct IndexLayerMetadata {
#[serde(default = "Generation::none")]
#[serde(skip_serializing_if = "Generation::is_none")]
pub generation: Generation,
pub(super) generation: Generation,
}
impl From<LayerFileMetadata> for IndexLayerMetadata {

View File

@@ -882,15 +882,3 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
self
}
}
impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
fn key(&self) -> Key {
self.key
}
fn lsn(&self) -> Lsn {
self.lsn
}
fn size(&self) -> u64 {
self.size
}
}

View File

@@ -125,7 +125,6 @@ impl Layer {
let inner = Arc::new(DownloadedLayer {
owner: owner.clone(),
kind: tokio::sync::OnceCell::default(),
version: 0,
});
resident = Some(inner.clone());
@@ -164,7 +163,6 @@ impl Layer {
let inner = Arc::new(DownloadedLayer {
owner: owner.clone(),
kind: tokio::sync::OnceCell::default(),
version: 0,
});
resident = Some(inner.clone());
let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
@@ -330,46 +328,42 @@ impl Layer {
/// read with [`Layer::get_value_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
enum ResidentOrWantedEvicted {
Resident(Arc<DownloadedLayer>),
WantedEvicted(Weak<DownloadedLayer>, usize),
WantedEvicted(Weak<DownloadedLayer>),
}
impl ResidentOrWantedEvicted {
fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
fn get(&self) -> Option<Arc<DownloadedLayer>> {
match self {
ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
Some(strong) => {
LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
*self = ResidentOrWantedEvicted::Resident(strong.clone());
Some((strong, true))
Some(strong)
}
None => None,
},
}
}
/// When eviction is first requested, drop down to holding a [`Weak`].
///
/// Returns `Some` if this was the first time eviction was requested. Care should be taken to
/// drop the possibly last strong reference outside of the mutex of
/// heavier_once_cell::OnceCell.
fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
match self {
/// Returns `true` if this was the first time eviction was requested.
fn downgrade(&mut self) -> &Weak<DownloadedLayer> {
let _was_first = match self {
ResidentOrWantedEvicted::Resident(strong) => {
let weak = Arc::downgrade(strong);
let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
std::mem::swap(self, &mut temp);
match temp {
ResidentOrWantedEvicted::Resident(strong) => Some(strong),
ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
}
*self = ResidentOrWantedEvicted::WantedEvicted(weak);
// returning the weak is not useful, because the drop could have already run with
// the replacement above, and that will take care of cleaning the Option we are in
true
}
ResidentOrWantedEvicted::WantedEvicted(..) => None,
ResidentOrWantedEvicted::WantedEvicted(_) => false,
};
match self {
ResidentOrWantedEvicted::WantedEvicted(ref weak) => weak,
_ => unreachable!("just wrote wanted evicted"),
}
}
}
@@ -404,17 +398,11 @@ struct LayerInner {
/// [`LayerInner::on_downloaded_layer_drop`].
wanted_evicted: AtomicBool,
/// Version is to make sure we will only evict a specific download of a file.
///
/// Incremented for each download, stored in `DownloadedLayer::version` or
/// `ResidentOrWantedEvicted::WantedEvicted`.
/// Version is to make sure we will in fact only evict a file if no new download has been
/// started.
version: AtomicUsize,
/// Allow subscribing to when the layer actually gets evicted.
///
/// If in future we need to implement "wait until layer instances are gone and done", carrying
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
/// method for "wait_gc" which will wait to this being closed.
status: tokio::sync::broadcast::Sender<Status>,
/// Counter for exponential backoff with the download
@@ -527,14 +515,6 @@ impl LayerInner {
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
.join(desc.filename().to_string());
let (inner, version) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
(heavier_once_cell::OnceCell::new(resident), version)
} else {
(heavier_once_cell::OnceCell::default(), 0)
};
LayerInner {
conf,
path,
@@ -544,8 +524,12 @@ impl LayerInner {
access_stats,
wanted_garbage_collected: AtomicBool::new(false),
wanted_evicted: AtomicBool::new(false),
inner,
version: AtomicUsize::new(version),
inner: if let Some(inner) = downloaded {
heavier_once_cell::OnceCell::new(ResidentOrWantedEvicted::Resident(inner))
} else {
heavier_once_cell::OnceCell::default()
},
version: AtomicUsize::new(0),
status: tokio::sync::broadcast::channel(1).0,
consecutive_failures: AtomicUsize::new(0),
generation,
@@ -565,8 +549,6 @@ impl LayerInner {
}
}
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(
&self,
_: &RemoteTimelineClient,
@@ -577,22 +559,20 @@ impl LayerInner {
let mut rx = self.status.subscribe();
let strong = {
match self.inner.get() {
Some(mut either) => {
self.wanted_evicted.store(true, Ordering::Relaxed);
either.downgrade()
}
None => return Err(EvictionError::NotFound),
}
};
let res =
self.wanted_evicted
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
if strong.is_some() {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
if res.is_ok() {
LAYER_IMPL_METRICS.inc_started_evictions();
}
if self.get().is_none() {
// it was not evictable in the first place
// our store to the wanted_evicted does not matter; it will be reset by next download
return Err(EvictionError::NotFound);
}
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -606,8 +586,7 @@ impl LayerInner {
//
// use however late we are (compared to when eviction was first requested) as the
// "outcome" now
LAYER_IMPL_METRICS.inc_broadcast_lagged();
match self.inner.get() {
match self.get() {
Some(_) => Err(EvictionError::Downloaded),
None => Ok(()),
}
@@ -615,19 +594,17 @@ impl LayerInner {
}
}
/// Cancellation safe.
#[tracing::instrument(skip_all, fields(layer=%self))]
/// Should be cancellation safe, but cancellation is troublesome together with the spawned
/// download.
async fn get_or_maybe_download(
self: &Arc<Self>,
allow_download: bool,
ctx: Option<&RequestContext>,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
let mut init_permit = None;
loop {
let download = move |permit| async move {
let download = move || async move {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
self.version.fetch_add(1, Ordering::Relaxed);
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
@@ -646,11 +623,7 @@ impl LayerInner {
.await
.map_err(DownloadError::PreStatFailed)?;
let permit = if let Some(reason) = needs_download {
if let NeedsDownload::NotFile(ft) = reason {
return Err(DownloadError::NotFile(ft));
}
if let Some(reason) = needs_download {
// only reset this after we've decided we really need to download. otherwise it'd
// be impossible to mark cancelled downloads for eviction, like one could imagine
// we would like to do for prefetching which was not needed.
@@ -660,6 +633,8 @@ impl LayerInner {
return Err(DownloadError::NoRemoteStorage);
}
tracing::debug!(%reason, "downloading layer");
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
@@ -670,21 +645,16 @@ impl LayerInner {
return Err(DownloadError::DownloadRequired);
}
tracing::info!(%reason, "downloading on-demand");
self.spawn_download_and_wait(timeline, permit).await?
self.spawn_download_and_wait(timeline).await?;
} else {
// the file is present locally, probably by a previous but cancelled call to
// get_or_maybe_download. alternatively we might be running without remote storage.
LAYER_IMPL_METRICS.inc_init_needed_no_download();
permit
};
}
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
version: next_version,
});
self.access_stats.record_residence_event(
@@ -692,60 +662,19 @@ impl LayerInner {
LayerResidenceEventReason::ResidenceChange,
);
let waiters = self.inner.initializer_count();
if waiters > 0 {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
Ok((ResidentOrWantedEvicted::Resident(res), permit))
Ok(ResidentOrWantedEvicted::Resident(res))
};
if let Some(init_permit) = init_permit.take() {
// use the already held initialization permit because it is impossible to hit the
// below paths anymore essentially limiting the max loop iterations to 2.
let (value, init_permit) = download(init_permit).await?;
let mut guard = self.inner.set(value, init_permit);
let (strong, _upgraded) = guard
.get_and_upgrade()
.expect("init creates strong reference, we held the init permit");
let locked = self.inner.get_or_init(download).await?;
if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
{
return Ok(strong);
}
let (weak, permit) = {
let mut locked = self.inner.get_or_init(download).await?;
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
if upgraded {
// when upgraded back, the Arc<DownloadedLayer> is still available, but
// previously an `evict_and_wait` was received.
self.wanted_evicted.store(false, Ordering::Relaxed);
// error out any `evict_and_wait`
drop(self.status.send(Status::Downloaded));
LAYER_IMPL_METRICS
.inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
}
return Ok(strong);
} else {
// path to here: the evict_blocking is stuck on spawn_blocking queue.
//
// reset the contents, deactivating the eviction and causing an
// EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
locked.take_and_deinit()
}
};
// unlock first, then drop the weak, but because upgrade failed, we
// know it cannot be a problem.
assert!(
matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
"unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
);
init_permit = Some(permit);
// the situation in which we might need to retry is that our init was ready
// immediately, but the DownloadedLayer had been dropped BUT failed to complete
// Self::evict_blocking
LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
}
}
@@ -757,8 +686,8 @@ impl LayerInner {
match b {
Download => Ok(()),
Warn | Error => {
tracing::info!(
"unexpectedly on-demand downloading for task kind {:?}",
tracing::warn!(
"unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
ctx.task_kind()
);
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -780,17 +709,14 @@ impl LayerInner {
async fn spawn_download_and_wait(
self: &Arc<Self>,
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
) -> Result<(), DownloadError> {
let task_name = format!("download layer {}", self);
let (tx, rx) = tokio::sync::oneshot::channel();
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
// block tenant::mgr::remove_tenant_from_memory.
let this: Arc<Self> = self.clone();
crate::task_mgr::spawn(
&tokio::runtime::Handle::current(),
crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -799,7 +725,6 @@ impl LayerInner {
&task_name,
false,
async move {
let client = timeline
.remote_client
.as_ref()
@@ -821,9 +746,9 @@ impl LayerInner {
}
};
if let Err(res) = tx.send((result, permit)) {
if let Err(res) = tx.send(result) {
match res {
(Ok(()), _) => {
Ok(()) => {
// our caller is cancellation safe so this is fine; if someone
// else requests the layer, they'll find it already downloaded
// or redownload.
@@ -834,7 +759,7 @@ impl LayerInner {
tracing::info!("layer file download completed after requester had cancelled");
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
},
(Err(e), _) => {
Err(e) => {
// our caller is cancellation safe, but we might be racing with
// another attempt to initialize. before we have cancellation
// token support: these attempts should converge regardless of
@@ -850,7 +775,7 @@ impl LayerInner {
.in_current_span(),
);
match rx.await {
Ok((Ok(()), permit)) => {
Ok(Ok(())) => {
if let Some(reason) = self
.needs_download()
.await
@@ -861,12 +786,10 @@ impl LayerInner {
}
self.consecutive_failures.store(0, Ordering::Relaxed);
tracing::info!("on-demand download successful");
Ok(permit)
Ok(())
}
Ok((Err(e), _permit)) => {
// FIXME: this should be with the spawned task and be cancellation sensitive
Ok(Err(e)) => {
let consecutive_failures =
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -884,6 +807,33 @@ impl LayerInner {
}
}
/// Access the current state without waiting for the file to be downloaded.
///
/// Requires that we've initialized to state which is respective to the
/// actual residency state.
fn get(&self) -> Option<Arc<DownloadedLayer>> {
let locked = self.inner.get();
Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
}
fn get_or_apply_evictedness(
guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
wanted_evicted: &AtomicBool,
) -> Option<Arc<DownloadedLayer>> {
if let Some(mut x) = guard {
if let Some(won) = x.get() {
// there are no guarantees that we will always get to observe a concurrent call
// to evict
if wanted_evicted.load(Ordering::Acquire) {
x.downgrade();
}
return Some(won);
}
}
None
}
async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
match tokio::fs::metadata(&self.path).await {
Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -903,7 +853,7 @@ impl LayerInner {
fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
// in future, this should include sha2-256 validation of the file.
if !m.is_file() {
Err(NeedsDownload::NotFile(m.file_type()))
Err(NeedsDownload::NotFile)
} else if m.len() != self.desc.file_size {
Err(NeedsDownload::WrongSize {
actual: m.len(),
@@ -917,9 +867,7 @@ impl LayerInner {
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.desc.filename().file_name();
// this is not accurate: we could have the file locally but there was a cancellation
// and now we are not in sync, or we are currently downloading it.
let remote = self.inner.get().is_none();
let remote = self.get().is_none();
let access_stats = self.access_stats.as_api_model(reset);
@@ -948,7 +896,7 @@ impl LayerInner {
}
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
fn on_downloaded_layer_drop(self: Arc<LayerInner>) {
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
let evict = self.wanted_evicted.load(Ordering::Acquire);
let can_evict = self.have_remote_client;
@@ -956,16 +904,15 @@ impl LayerInner {
if gc {
// do nothing now, only in LayerInner::drop
} else if can_evict && evict {
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
let version = self.version.load(Ordering::Relaxed);
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self);
// downgrade for queueing; in case there's a teardown already ongoing we should not
// hold it alive.
let this = Arc::downgrade(&self);
drop(self);
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
// drop while the `self.inner` is being locked, leading to a deadlock.
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();
@@ -975,15 +922,19 @@ impl LayerInner {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
return;
};
match this.evict_blocking(version) {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
}
this.evict_blocking(version);
});
}
}
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
fn evict_blocking(&self, version: usize) {
match self.evict_blocking0(version) {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
}
}
fn evict_blocking0(&self, version: usize) -> Result<(), EvictionCancelled> {
// deleted or detached timeline, don't do anything.
let Some(timeline) = self.timeline.upgrade() else {
return Err(EvictionCancelled::TimelineGone);
@@ -994,34 +945,32 @@ impl LayerInner {
let _permit = {
let maybe_downloaded = self.inner.get();
let (_weak, permit) = match maybe_downloaded {
Some(mut guard) => {
if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
if *version == only_version {
guard.take_and_deinit()
} else {
// this was not for us; maybe there's another eviction job
// TODO: does it make any sense to stall here? unique versions do not
// matter, we only want to make sure not to evict a resident, which we
// are not doing.
return Err(EvictionCancelled::VersionCheckFailed);
}
} else {
return Err(EvictionCancelled::AlreadyReinitialized);
}
if version != self.version.load(Ordering::Relaxed) {
// downloadness-state has advanced, we might no longer be the latest eviction
// work; don't do anything.
//
// this is possible to get to by having:
//
// 1. wanted_evicted.store(true)
// 2. ResidentOrWantedEvicted::downgrade
// 3. DownloadedLayer::drop
// 4. LayerInner::get_or_maybe_download
// 5. LayerInner::evict_blocking
return Err(EvictionCancelled::VersionCheckFailed);
}
// free the DownloadedLayer allocation
match maybe_downloaded.map(|mut g| g.take_and_deinit()) {
Some((taken, permit)) => {
assert!(matches!(taken, ResidentOrWantedEvicted::WantedEvicted(_)));
permit
}
None => {
// already deinitialized, perhaps get_or_maybe_download did this and is
// currently waiting to reinitialize it
return Err(EvictionCancelled::LostToDownload);
unreachable!("we do the version checking for this exact reason")
}
};
permit
}
};
// now accesses to inner.get_or_init wait on the semaphore or the `_permit`
self.access_stats.record_residence_event(
LayerResidenceStatus::Evicted,
LayerResidenceEventReason::ResidenceChange,
@@ -1054,14 +1003,11 @@ impl LayerInner {
Ok(())
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
tracing::error!(
layer_size = %self.desc.file_size,
"failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
);
tracing::info!("failed to evict file from disk, it was already gone");
Err(EvictionCancelled::FileNotFound)
}
Err(e) => {
tracing::error!("failed to evict file from disk: {e:#}");
tracing::warn!("failed to evict file from disk: {e:#}");
Err(EvictionCancelled::RemoveFailed)
}
};
@@ -1105,8 +1051,6 @@ enum DownloadError {
ContextAndConfigReallyDeniesDownloads,
#[error("downloading is really required but not allowed by this method")]
DownloadRequired,
#[error("layer path exists, but it is not a file: {0:?}")]
NotFile(std::fs::FileType),
/// Why no error here? Because it will be reported by page_service. We should have also done
/// retries already.
#[error("downloading evicted layer file failed")]
@@ -1122,7 +1066,7 @@ enum DownloadError {
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
NotFound,
NotFile(std::fs::FileType),
NotFile,
WrongSize { actual: u64, expected: u64 },
}
@@ -1130,7 +1074,7 @@ impl std::fmt::Display for NeedsDownload {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NeedsDownload::NotFound => write!(f, "file was not found"),
NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
NeedsDownload::NotFile => write!(f, "path is not a file"),
NeedsDownload::WrongSize { actual, expected } => {
write!(f, "file size mismatch {actual} vs. {expected}")
}
@@ -1141,10 +1085,7 @@ impl std::fmt::Display for NeedsDownload {
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
pub(crate) struct DownloadedLayer {
owner: Weak<LayerInner>,
// Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
// DownloadedLayer
kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
version: usize,
}
impl std::fmt::Debug for DownloadedLayer {
@@ -1152,7 +1093,6 @@ impl std::fmt::Debug for DownloadedLayer {
f.debug_struct("DownloadedLayer")
// owner omitted because it is always "Weak"
.field("kind", &self.kind)
.field("version", &self.version)
.finish()
}
}
@@ -1160,7 +1100,7 @@ impl std::fmt::Debug for DownloadedLayer {
impl Drop for DownloadedLayer {
fn drop(&mut self) {
if let Some(owner) = self.owner.upgrade() {
owner.on_downloaded_layer_drop(self.version);
owner.on_downloaded_layer_drop();
} else {
// no need to do anything, we are shutting down
}
@@ -1186,6 +1126,7 @@ impl DownloadedLayer {
"these are the same, just avoiding the upgrade"
);
// there is nothing async here, but it should be async
let res = if owner.desc.is_delta {
let summary = Some(delta_layer::Summary::expected(
owner.desc.tenant_id,
@@ -1284,8 +1225,6 @@ impl std::fmt::Debug for ResidentLayer {
impl ResidentLayer {
/// Release the eviction guard, converting back into a plain [`Layer`].
///
/// You can access the [`Layer`] also by using `as_ref`.
pub(crate) fn drop_eviction_guard(self) -> Layer {
self.into()
}
@@ -1341,7 +1280,7 @@ impl AsRef<Layer> for ResidentLayer {
}
}
/// Drop the eviction guard.
/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
impl From<ResidentLayer> for Layer {
fn from(value: ResidentLayer) -> Self {
value.owner
@@ -1511,13 +1450,6 @@ impl LayerImplMetrics {
.unwrap()
.inc();
}
fn inc_broadcast_lagged(&self) {
self.rare_counters
.get_metric_with_label_values(&["broadcast_lagged"])
.unwrap()
.inc();
}
}
enum EvictionCancelled {
@@ -1526,11 +1458,6 @@ enum EvictionCancelled {
VersionCheckFailed,
FileNotFound,
RemoveFailed,
AlreadyReinitialized,
/// Not evicted because of a pending reinitialization
LostToDownload,
/// After eviction, there was a new layer access which cancelled the eviction.
UpgradedBackOnAccess,
}
impl EvictionCancelled {
@@ -1541,9 +1468,6 @@ impl EvictionCancelled {
EvictionCancelled::VersionCheckFailed => "version_check_fail",
EvictionCancelled::FileNotFound => "file_not_found",
EvictionCancelled::RemoveFailed => "remove_failed",
EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
EvictionCancelled::LostToDownload => "lost_to_download",
EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
}
}
}

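The `ResidentOrWantedEvicted` churn above revolves around one idea: a resident layer is held through a strong `Arc`, and asking for eviction downgrades that to a `Weak`, so that dropping the last strong `DownloadedLayer` handle is what actually triggers eviction. A self-contained sketch of the pattern with simplified names, not the pageserver's exact types:

    use std::sync::{Arc, Weak};

    // Simplified stand-in for DownloadedLayer.
    struct Downloaded;

    enum ResidentOrWantedEvicted {
        Resident(Arc<Downloaded>),
        WantedEvicted(Weak<Downloaded>),
    }

    impl ResidentOrWantedEvicted {
        /// Hand out a strong handle if the download is still alive; if an access
        /// races with a pending eviction, flip back to `Resident`.
        fn get(&mut self) -> Option<Arc<Downloaded>> {
            match self {
                ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
                ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
                    Some(strong) => {
                        *self = ResidentOrWantedEvicted::Resident(strong.clone());
                        Some(strong)
                    }
                    None => None,
                },
            }
        }

        /// Keep only a `Weak`; once every strong handle is gone, the layer file
        /// can be evicted from local disk.
        fn downgrade(&mut self) {
            if let ResidentOrWantedEvicted::Resident(strong) = self {
                let weak = Arc::downgrade(strong);
                *self = ResidentOrWantedEvicted::WantedEvicted(weak);
            }
        }
    }

    fn main() {
        let mut slot = ResidentOrWantedEvicted::Resident(Arc::new(Downloaded));
        slot.downgrade();
        // The only strong reference lived in the slot, so the upgrade now fails.
        assert!(slot.get().is_none());
    }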
View File

@@ -162,11 +162,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -196,7 +193,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction semaphore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on the first iteration.
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::Compaction,
);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -205,6 +211,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
@@ -239,11 +247,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let period = tenant.get_gc_period();
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
if first && random_init_delay(period, &cancel).await.is_err() {
break;
}
let started_at = Instant::now();
@@ -277,7 +282,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
if !first {
// The first iteration is typically much slower, because all tenants compete for the
// compaction semaphore to run, and because of concurrent startup work like initializing
// logical sizes. To avoid routinely spamming warnings, we suppress this log on the first iteration.
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
}
// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -286,6 +296,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
{
break;
}
first = false;
}
}
.await;
@@ -361,7 +373,7 @@ pub(crate) fn warn_when_period_overrun(
// humantime does no significant digits clamping whereas Duration's debug is a bit more
// intelligent. however it makes sense to keep the "configuration format" for period, even
// though there's no way to output the actual config value.
info!(
warn!(
?elapsed,
period = %humantime::format_duration(period),
?task,

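Both loops above jitter their first iteration with `random_init_delay` and bail out if cancellation wins the race. A rough, self-contained sketch of what such a helper can look like; the real one lives in the `utils` crate and may differ, so treat this as an illustration only:

    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    /// Sleep for a uniformly random fraction of `period`, or return Err(()) if cancelled first.
    async fn random_init_delay(period: Duration, cancel: &CancellationToken) -> Result<(), ()> {
        let jitter = period.mul_f64(rand::random::<f64>()); // somewhere in [0, period)
        tokio::select! {
            _ = cancel.cancelled() => Err(()),
            _ = tokio::time::sleep(jitter) => Ok(()),
        }
    }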
View File

@@ -1,4 +1,3 @@
mod compaction;
pub mod delete;
mod eviction_task;
mod init;
@@ -60,7 +59,7 @@ use crate::metrics::{
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
use crate::tenant::config::{CompactionAlgorithm, EvictionPolicy, TenantConfOpt};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use pageserver_api::reltag::RelTag;
use postgres_connection::PgConnectionConfig;
@@ -698,18 +697,6 @@ impl Timeline {
return Ok(());
}
match self.get_compaction_algorithm() {
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, ctx).await,
}
}
/// TODO: cancellation
async fn compact_legacy(
self: &Arc<Self>,
_cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
// High level strategy for compaction / image creation:
//
// 1. First, calculate the desired "partitioning" of the
@@ -1219,13 +1206,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.compaction_algorithm
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
}
fn get_eviction_policy(&self) -> EvictionPolicy {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
@@ -3030,7 +3010,7 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
}
impl Timeline {
/// Level0 files first phase of compaction, explained in the [`compact_legacy`] comment.
/// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
///
/// This method takes the `_layer_removal_cs` guard to highlight that required downloads are
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
@@ -3513,23 +3493,6 @@ impl Timeline {
return Ok(());
}
self.finish_compact_batch(
layer_removal_cs,
&new_layers,
&Vec::new(),
&deltas_to_compact,
)
.await?;
Ok(())
}
async fn finish_compact_batch(
self: &Arc<Self>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
new_deltas: &[ResidentLayer],
new_images: &[ResidentLayer],
layers_to_remove: &[Layer],
) -> anyhow::Result<()> {
// Before deleting any layers, we need to wait for their upload ops to finish.
// See remote_timeline_client module level comment on consistency.
// Do it here because we don't want to hold self.layers.write() while waiting.
@@ -3545,9 +3508,9 @@ impl Timeline {
let mut duplicated_layers = HashSet::new();
let mut insert_layers = Vec::with_capacity(new_deltas.len());
let mut insert_layers = Vec::with_capacity(new_layers.len());
for l in new_deltas {
for l in &new_layers {
if guard.contains(l.as_ref()) {
// expected in tests
tracing::error!(layer=%l, "duplicated L1 layer");
@@ -3558,22 +3521,18 @@ impl Timeline {
// because we have not implemented L0 => L0 compaction.
duplicated_layers.insert(l.layer_desc().key());
} else if LayerMap::is_l0(l.layer_desc()) {
bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
} else {
insert_layers.push(l.clone());
}
}
// only remove those inputs which were not outputs
let remove_layers: Vec<Layer> = layers_to_remove
.iter()
.filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
.cloned()
.collect();
if !new_images.is_empty() {
guard.track_new_image_layers(new_images, &self.metrics);
}
let remove_layers = {
let mut deltas_to_compact = deltas_to_compact;
// only remove those inputs which were not outputs
deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key()));
deltas_to_compact
};
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
guard.finish_compact_l0(
@@ -3584,7 +3543,7 @@ impl Timeline {
);
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&remove_layers, &new_deltas)?;
remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
}
drop_wlock(guard);

View File

@@ -1,473 +0,0 @@
//! New compaction implementation. The algorithm itself is implemented in the
//! compaction crate. This file implements the callbacks and structs that allow
//! the algorithm to drive the process.
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.
use std::ops::Range;
use std::sync::Arc;
use super::Timeline;
use async_trait::async_trait;
use fail::fail_point;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use crate::context::RequestContext;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::tenant::PageReconstructError;
use crate::ZERO_PAGE;
use crate::keyspace::KeySpace;
use crate::repository::Key;
use utils::lsn::Lsn;
use pageserver_compaction::helpers::overlaps_with;
use pageserver_compaction::interface::*;
use super::CompactionError;
impl Timeline {
/// Entry point for new tiered compaction algorithm.
///
/// All the real work is in the implementation in the pageserver_compaction
/// crate. The code here would apply to any algorithm implemented by the
/// same interface, but tiered is the only one at the moment.
///
/// TODO: cancellation
pub(crate) async fn compact_tiered(
self: &Arc<Self>,
_cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
let fanout = self.get_compaction_threshold() as u64;
let target_file_size = self.get_checkpoint_distance();
// Find the top of the historical layers
let end_lsn = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let l0_deltas = layers.get_level0_deltas()?;
drop(guard);
// As an optimization, if we find that there are too few L0 layers,
// bail out early. We know that the compaction algorithm would do
// nothing in that case.
if l0_deltas.len() < fanout as usize {
// doesn't need compacting
return Ok(());
}
l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
};
// now lock out layer removal (compaction, gc, timeline deletion)
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
// Is the timeline being deleted?
if self.is_stopping() {
trace!("Dropping out of compaction on timeline shutdown");
return Err(CompactionError::ShuttingDown);
}
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, layer_removal_cs, (end_lsn, keyspace));
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
end_lsn,
target_file_size,
fanout,
&ctx_adaptor,
)
.await?;
adaptor.flush_updates().await?;
Ok(())
}
}
struct TimelineAdaptor {
timeline: Arc<Timeline>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
keyspace: (Lsn, KeySpace),
new_deltas: Vec<ResidentLayer>,
new_images: Vec<ResidentLayer>,
layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
}
impl TimelineAdaptor {
pub fn new(
timeline: &Arc<Timeline>,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
keyspace: (Lsn, KeySpace),
) -> Self {
Self {
timeline: timeline.clone(),
layer_removal_cs,
keyspace,
new_images: Vec::new(),
new_deltas: Vec::new(),
layers_to_delete: Vec::new(),
}
}
pub async fn flush_updates(&mut self) -> anyhow::Result<()> {
let layers_to_delete = {
let guard = self.timeline.layers.read().await;
self.layers_to_delete
.iter()
.map(|x| guard.get_from_desc(x))
.collect::<Vec<Layer>>()
};
self.timeline
.finish_compact_batch(
self.layer_removal_cs.clone(),
&self.new_deltas,
&self.new_images,
&layers_to_delete,
)
.await?;
self.new_images.clear();
self.new_deltas.clear();
self.layers_to_delete.clear();
Ok(())
}
}
#[derive(Clone)]
struct ResidentDeltaLayer(ResidentLayer);
#[derive(Clone)]
struct ResidentImageLayer(ResidentLayer);
#[async_trait]
impl CompactionJobExecutor for TimelineAdaptor {
type Key = crate::repository::Key;
type Layer = Arc<PersistentLayerDesc>;
type DeltaLayer = ResidentDeltaLayer;
type ImageLayer = ResidentImageLayer;
type RequestContext = RequestContextAdaptor;
async fn get_layers(
&mut self,
key_range: &Range<Key>,
lsn_range: &Range<Lsn>,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<Arc<PersistentLayerDesc>>> {
self.flush_updates().await?;
let guard = self.timeline.layers.read().await;
let layer_map = guard.layer_map();
let result = layer_map
.iter_historic_layers()
.filter(|l| {
overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
})
.collect();
Ok(result)
}
async fn get_keyspace(
&mut self,
key_range: &Range<Key>,
lsn: Lsn,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<Range<Key>>> {
if lsn == self.keyspace.0 {
Ok(pageserver_compaction::helpers::intersect_keyspace(
&self.keyspace.1.ranges,
key_range,
))
} else {
// The current compaction implementation only ever requests the key space
// at the compaction end LSN.
anyhow::bail!("keyspace not available for requested lsn");
}
}
async fn downcast_delta_layer(
&self,
layer: &Arc<PersistentLayerDesc>,
) -> anyhow::Result<Option<ResidentDeltaLayer>> {
// this is a lot more complex than a simple downcast...
if layer.is_delta() {
let l = {
let guard = self.timeline.layers.read().await;
guard.get_from_desc(layer)
};
let result = l.download_and_keep_resident().await?;
Ok(Some(ResidentDeltaLayer(result)))
} else {
Ok(None)
}
}
async fn create_image(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
}
async fn create_delta(
&mut self,
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[ResidentDeltaLayer],
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
let mut all_entries = Vec::new();
for dl in input_layers.iter() {
all_entries.extend(dl.load_keys(ctx).await?);
}
// The current stdlib sorting implementation is designed in a way where it is
// particularly fast when the slice is made up of sorted sub-ranges.
all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
let mut writer = DeltaLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_id,
key_range.start,
lsn_range.clone(),
)
.await?;
let mut dup_values = 0;
// This iterator walks through all key-value pairs from all the layers
// we're compacting, in key, LSN order.
let mut prev: Option<(Key, Lsn)> = None;
for &DeltaEntry {
key, lsn, ref val, ..
} in all_entries.iter()
{
if prev == Some((key, lsn)) {
// This is a duplicate. Skip it.
//
// It can happen if compaction is interrupted after writing some
// layers but not all, and we are compacting the range again.
// The calculations in the algorithm assume that there are no
// duplicates, so the math on targeted file size is likely off,
// and we will create smaller files than expected.
dup_values += 1;
continue;
}
let value = val.load(ctx).await?;
writer.put_value(key, lsn, value).await?;
prev = Some((key, lsn));
}
if dup_values > 0 {
warn!("delta layer created with {} duplicate values", dup_values);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
Err(anyhow::anyhow!(
"failpoint delta-layer-writer-fail-before-finish"
))
});
let new_delta_layer = writer
.finish(prev.unwrap().0.next(), &self.timeline)
.await?;
self.new_deltas.push(new_delta_layer);
Ok(())
}
async fn delete_layer(
&mut self,
layer: &Arc<PersistentLayerDesc>,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
self.layers_to_delete.push(layer.clone());
Ok(())
}
}
impl TimelineAdaptor {
async fn create_image_impl(
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContextAdaptor,
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
let mut image_layer_writer = ImageLayerWriter::new(
self.timeline.conf,
self.timeline.timeline_id,
self.timeline.tenant_id,
key_range,
lsn,
)
.await?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
Err(PageReconstructError::Other(anyhow::anyhow!(
"failpoint image-layer-writer-fail-before-finish"
)))
});
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
for range in &keyspace_ranges {
let mut key = range.start;
while key < range.end {
let img = match self.timeline.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
// If we fail to reconstruct a VM or FSM page, we can zero the
// page without losing any actual user data. That seems better
// than failing repeatedly and getting stuck.
//
// We had a bug at one point, where we truncated the FSM and VM
// in the pageserver, but the Postgres didn't know about that
// and continued to generate incremental WAL records for pages
// that didn't exist in the pageserver. Trying to replay those
// WAL records failed to find the previous image of the page.
// This special case allows us to recover from that situation.
// See https://github.com/neondatabase/neon/issues/2601.
//
// Unfortunately we cannot do this for the main fork, or for
// any metadata keys, as that would lead to actual data
// loss.
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
ZERO_PAGE.clone()
} else {
return Err(err);
}
}
};
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
}
let image_layer = image_layer_writer.finish(&self.timeline).await?;
self.new_images.push(image_layer);
timer.stop_and_record();
Ok(())
}
}
pub struct RequestContextAdaptor(pub RequestContext);
impl std::ops::Deref for RequestContextAdaptor {
type Target = RequestContext;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CompactionRequestContext for RequestContextAdaptor {}
impl CompactionLayer<Key> for Arc<PersistentLayerDesc> {
fn key_range(&self) -> &Range<Key> {
&self.key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.lsn_range
}
fn file_size(&self) -> u64 {
self.file_size
}
fn short_id(&self) -> std::string::String {
self.as_ref().short_id().to_string()
}
fn is_delta(&self) -> bool {
self.as_ref().is_delta()
}
}
impl CompactionLayer<Key> for Arc<DeltaLayer> {
fn key_range(&self) -> &Range<Key> {
&self.layer_desc().key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.layer_desc().lsn_range
}
fn file_size(&self) -> u64 {
self.layer_desc().file_size
}
fn short_id(&self) -> std::string::String {
self.layer_desc().short_id().to_string()
}
fn is_delta(&self) -> bool {
true
}
}
use crate::tenant::timeline::DeltaEntry;
impl CompactionLayer<Key> for ResidentDeltaLayer {
fn key_range(&self) -> &Range<Key> {
&self.0.layer_desc().key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.0.layer_desc().lsn_range
}
fn file_size(&self) -> u64 {
self.0.layer_desc().file_size
}
fn short_id(&self) -> std::string::String {
self.0.layer_desc().short_id().to_string()
}
fn is_delta(&self) -> bool {
true
}
}
#[async_trait]
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;
async fn load_keys<'a>(
&self,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
self.0.load_keys(ctx).await
}
}
impl CompactionLayer<Key> for ResidentImageLayer {
fn key_range(&self) -> &Range<Key> {
&self.0.layer_desc().key_range
}
fn lsn_range(&self) -> &Range<Lsn> {
&self.0.layer_desc().lsn_range
}
fn file_size(&self) -> u64 {
self.0.layer_desc().file_size
}
fn short_id(&self) -> std::string::String {
self.0.layer_desc().short_id().to_string()
}
fn is_delta(&self) -> bool {
false
}
}
impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}

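The `create_delta` path above sorts every entry by `(key, lsn)` and then skips consecutive duplicates, because re-running an interrupted compaction can feed the same entry in twice. Reduced to its core, that is a sort-then-dedup over the composite key; a sketch with plain tuples standing in for `DeltaEntry`:

    fn main() {
        // (key, lsn) pairs standing in for DeltaEntry { key, lsn, .. }.
        let mut entries = vec![(2u64, 20u64), (1, 10), (1, 10), (2, 20)];
        entries.sort(); // order by key, then lsn
        entries.dedup(); // drop consecutive repeats, like the `prev == Some((key, lsn))` check above
        assert_eq!(entries, vec![(1, 10), (2, 20)]);
    }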
View File

@@ -459,7 +459,7 @@ async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
// extract the row contents into an IdentifySystem struct.
// written as a closure so I can use ? for Option here.
if let Some(SimpleQueryMessage::Row(first_row)) = response.first() {
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
Ok(IdentifySystem {
systemid: get_parse(first_row, 0)?,
timeline: get_parse(first_row, 1)?,

View File

@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
use utils::fs_ext;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -174,78 +173,37 @@ impl OpenFiles {
}
}
/// Identify error types that should always terminate the process. Other
/// error types may be eligible for retry.
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
use nix::errno::Errno::*;
match e.raw_os_error().map(nix::errno::from_i32) {
Some(EIO) => {
// Terminate on EIO because we no longer trust the device to store
// data safely, or to uphold persistence guarantees on fsync.
true
}
Some(EROFS) => {
// Terminate on EROFS because a filesystem is usually remounted
// readonly when it has experienced some critical issue, so the same
// logic as EIO applies.
true
}
Some(EACCES) => {
// Terminate on EACCES because we should always have permissions
// for our own data dir: if we don't, then we can't do our job and
// need administrative intervention to fix permissions. Terminating
// is the best way to make sure we stop cleanly rather than going
// into infinite retry loops, and will make it clear to the outside
// world that we need help.
true
}
_ => {
// Treat all other local file I/O errors as retryable. This includes:
// - ENOSPC: we stay up and wait for eviction to free some space
// - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
// - WriteZero, Interrupted: these are used internally by VirtualFile
false
}
}
#[derive(Debug, thiserror::Error)]
pub enum CrashsafeOverwriteError {
#[error("final path has no parent dir")]
FinalPathHasNoParentDir,
#[error("remove tempfile")]
RemovePreviousTempfile(#[source] std::io::Error),
#[error("create tempfile")]
CreateTempfile(#[source] std::io::Error),
#[error("write tempfile")]
WriteContents(#[source] std::io::Error),
#[error("sync tempfile")]
SyncTempfile(#[source] std::io::Error),
#[error("rename tempfile to final path")]
RenameTempfileToFinalPath(#[source] std::io::Error),
#[error("open final path parent dir")]
OpenFinalPathParentDir(#[source] std::io::Error),
#[error("sync final path parent dir")]
SyncFinalPathParentDir(#[source] std::io::Error),
}
/// Call this when the local filesystem gives us an error with an external
/// cause: this includes EIO, EROFS, and EACCES: all these indicate either
/// bad storage or bad configuration, and we can't fix that from inside
/// a running process.
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
tracing::error!("Fatal I/O error: {e}: {context})");
std::process::abort();
}
pub(crate) trait MaybeFatalIo<T> {
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
fn fatal_err(self, context: &str) -> T;
}
impl<T> MaybeFatalIo<T> for std::io::Result<T> {
/// Terminate the process if the result is an error of a fatal type, else pass it through
///
/// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
/// not on ENOSPC.
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
if let Err(e) = &self {
if is_fatal_io_error(e) {
on_fatal_io_error(e, context);
}
}
self
}
/// Terminate the process on any I/O error.
///
/// This is appropriate for reads on files that we know exist: they should always work.
fn fatal_err(self, context: &str) -> T {
impl CrashsafeOverwriteError {
/// Returns true iff the new contents are durably stored.
pub fn are_new_contents_durable(&self) -> bool {
match self {
Ok(v) => v,
Err(e) => {
on_fatal_io_error(&e, context);
}
Self::FinalPathHasNoParentDir => false,
Self::RemovePreviousTempfile(_) => false,
Self::CreateTempfile(_) => false,
Self::WriteContents(_) => false,
Self::SyncTempfile(_) => false,
Self::RenameTempfileToFinalPath(_) => false,
Self::OpenFinalPathParentDir(_) => false,
Self::SyncFinalPathParentDir(_) => true,
}
}
}
@@ -326,13 +284,15 @@ impl VirtualFile {
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> std::io::Result<()> {
) -> Result<(), CrashsafeOverwriteError> {
let Some(final_path_parent) = final_path.parent() else {
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
};
std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
match std::fs::remove_file(tmp_path) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
}
let mut file = Self::open_with_options(
tmp_path,
OpenOptions::new()
@@ -341,20 +301,31 @@ impl VirtualFile {
// we bail out instead of causing damage.
.create_new(true),
)
.await?;
file.write_all(content).await?;
file.sync_all().await?;
.await
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
file.write_all(content)
.await
.map_err(CrashsafeOverwriteError::WriteContents)?;
file.sync_all()
.await
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
drop(file); // before the rename, that's important!
// renames are atomic
std::fs::rename(tmp_path, final_path)?;
std::fs::rename(tmp_path, final_path)
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
// Only open final path parent dirfd now, so that this operation only
// ever holds one VirtualFile fd at a time. That's important because
// the current `find_victim_slot` impl might pick the same slot for both
// VirtualFiles, and it eventually does a blocking write lock instead of
// try_lock.
let final_parent_dirfd =
Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
final_parent_dirfd.sync_all().await?;
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
.await
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
final_parent_dirfd
.sync_all()
.await
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
Ok(())
}

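The `crashsafe_overwrite` variants above all follow the same durability protocol: write the new contents to a temp file, fsync it, rename it over the final path, then fsync the parent directory so the rename itself survives a crash. A stripped-down `std::fs` sketch of that protocol, assuming Unix semantics and ignoring the `VirtualFile` machinery:

    use std::fs::{self, File, OpenOptions};
    use std::io::{self, Write};
    use std::path::Path;

    fn crashsafe_overwrite(final_path: &Path, tmp_path: &Path, content: &[u8]) -> io::Result<()> {
        let parent = final_path
            .parent()
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "final path has no parent dir"))?;

        // Remove a leftover temp file from a previously crashed attempt, if any.
        match fs::remove_file(tmp_path) {
            Ok(()) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => {}
            Err(e) => return Err(e),
        }

        let mut file = OpenOptions::new().write(true).create_new(true).open(tmp_path)?;
        file.write_all(content)?;
        file.sync_all()?;
        drop(file); // close before the rename

        fs::rename(tmp_path, final_path)?; // within one filesystem, rename is atomic
        File::open(parent)?.sync_all()?; // fsync the directory so the rename is durable
        Ok(())
    }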
View File

@@ -857,8 +857,7 @@ impl WalRedoProcess {
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
} else if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
@@ -908,8 +907,7 @@ impl WalRedoProcess {
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
} else if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}

View File

@@ -88,7 +88,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
static void WalSndLoop(WalProposer *wp);
static void XLogBroadcastWalProposer(WalProposer *wp);
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropClose(XLogRecPtr recptr);
static void
@@ -1241,7 +1241,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
/* write WAL to disk */
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
ereport(DEBUG1,
(errmsg("Recover message %X/%X length %d",
@@ -1283,24 +1283,11 @@ static XLogSegNo walpropSegNo = 0;
* Write XLOG data to disk.
*/
static void
XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
{
int startoff;
int byteswritten;
/*
* Apart from the walproposer, the basebackup LSN page is also written out by
* postgres itself, which writes WAL only in full pages; in the basebackup that
* page is inherently dummy (only safekeepers have the historic WAL). Update the
* WAL buffers here so the dummy page does not overwrite the correct one we
* download here. Ugly, but the alternatives are about as ugly. This won't be
* needed if we switch to on-demand WAL download from safekeepers, without
* writing to disk.
*
* https://github.com/neondatabase/neon/issues/5749
*/
if (!wp->config->syncSafekeepers)
XLogUpdateWalBuffers(buf, recptr, nbytes);
while (nbytes > 0)
{
int segbytes;

View File

@@ -13,7 +13,6 @@ pub struct ConsoleError {
#[derive(Deserialize)]
pub struct GetRoleSecret {
pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<Box<str>>>,
}
// Manually implement debug to omit sensitive info.
@@ -188,31 +187,4 @@ mod tests {
Ok(())
}
#[test]
fn parse_wake_compute() -> anyhow::Result<()> {
let json = json!({
"address": "0.0.0.0",
"aux": dummy_aux(),
});
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
Ok(())
}
#[test]
fn parse_get_role_secret() -> anyhow::Result<()> {
// Missing `allowed_ips` field.
let json = json!({
"role_secret": "secret",
});
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
// Non-empty `allowed_ips` field.
let json = json!({
"role_secret": "secret",
"allowed_ips": ["8.8.8.8"],
});
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
Ok(())
}
}
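The `allowed_ips` field toggled above illustrates the usual serde pattern for evolving a console response without breaking older payloads: model the new field as an Option, so JSON that omits the key still deserializes. A standalone sketch of that pattern (struct and field names mirror the diff; the main function and payloads are illustrative, and serde/serde_json are assumed):

use serde::Deserialize;

#[derive(Deserialize)]
struct GetRoleSecret {
    role_secret: Box<str>,
    // serde maps a missing key to None for Option fields, so old responses
    // without `allowed_ips` keep parsing.
    allowed_ips: Option<Vec<Box<str>>>,
}

fn main() -> Result<(), serde_json::Error> {
    // Old-style payload: no allowed_ips key at all.
    let old: GetRoleSecret = serde_json::from_str(r#"{"role_secret":"secret"}"#)?;
    assert!(old.allowed_ips.is_none());
    // New-style payload: allowed_ips present.
    let new: GetRoleSecret =
        serde_json::from_str(r#"{"role_secret":"secret","allowed_ips":["8.8.8.8"]}"#)?;
    assert_eq!(new.allowed_ips.unwrap().len(), 1);
    let _ = (old.role_secret, new.role_secret);
    Ok(())
}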

View File

@@ -59,7 +59,7 @@ impl Api {
let rows = client.query(query, &[&creds.user]).await?;
// We can get at most one row, because `rolname` is unique.
let row = match rows.first() {
let row = match rows.get(0) {
Some(row) => row,
// This means that the user doesn't exist, so there can be no secret.
// However, this is still a *valid* outcome which is very similar

View File

@@ -49,7 +49,7 @@ impl Api {
.endpoint
.get("proxy_get_role_secret")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.header("Authorization", &self.jwt)
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name),
@@ -94,7 +94,7 @@ impl Api {
.endpoint
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.header("Authorization", &self.jwt)
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name),

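Both hunks above toggle the Authorization header between the bare JWT and the RFC 6750 "Bearer <token>" form. A minimal reqwest sketch of building such a request (the URL and variable names are placeholders, not the real console endpoint):

use reqwest::Client;

async fn get_role_secret_sketch(jwt: &str, session_id: &str) -> reqwest::Result<reqwest::Response> {
    Client::new()
        .get("https://console.example.invalid/proxy_get_role_secret")
        // Bearer scheme: the header value is "Bearer " followed by the token.
        .header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
        .query(&[("session_id", session_id)])
        .send()
        .await
}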
View File

@@ -18,6 +18,7 @@ mod password;
pub use exchange::Exchange;
pub use key::ScramKey;
pub use secret::ServerSecret;
pub use secret::*;
use hmac::{Hmac, Mac};
use sha2::{Digest, Sha256};

View File

@@ -470,26 +470,30 @@ async fn query_to_json<T: GenericClient>(
}
.and_then(|s| s.parse::<i64>().ok());
let mut fields = vec![];
let mut columns = vec![];
for c in row_stream.columns() {
fields.push(json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
}));
columns.push(client.get_type(c.type_oid()).await?);
}
let fields = if !rows.is_empty() {
rows[0]
.columns()
.iter()
.map(|c| {
json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
})
})
.collect::<Vec<_>>()
} else {
Vec::new()
};
// convert rows to JSON
let rows = rows
.iter()
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
@@ -510,28 +514,22 @@ async fn query_to_json<T: GenericClient>(
//
pub fn pg_text_row_to_json(
row: &Row,
columns: &[Type],
raw_output: bool,
array_mode: bool,
) -> Result<Value, anyhow::Error> {
let iter = row
.columns()
.iter()
.zip(columns)
.enumerate()
.map(|(i, (column, typ))| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, typ)?
};
Ok((name.to_string(), json_value))
});
let iter = row.columns().iter().enumerate().map(|(i, column)| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, column.type_())?
};
Ok((name.to_string(), json_value))
});
if array_mode {
// drop keys and aggregate into array

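The larger variant of query_to_json above resolves each column's Postgres type before converting rows: `client.get_type(...)` is async, so it cannot run inside the per-row conversion, and the resolved types are then zipped with the columns in pg_text_row_to_json. A trimmed sketch of that shape with plain-string stand-ins for the tokio_postgres column and type handles (all names here are illustrative except the idea of zipping pre-resolved types):

use serde_json::Value;

// Stand-in for a resolved column: name plus its pre-fetched type.
struct Column { name: String, type_name: String }

// Stand-in for the real pg_text_to_json converter, which parses numerics,
// booleans, arrays, etc. based on the column type.
fn pg_text_to_json(raw: Option<&str>, _type_name: &str) -> Value {
    raw.map(|v| Value::String(v.to_string())).unwrap_or(Value::Null)
}

fn row_to_json(columns: &[Column], values: &[Option<&str>]) -> Value {
    // Zip each text value with its pre-resolved column type, mirroring how
    // the patched code awaits client.get_type() once per column up front.
    let obj: serde_json::Map<String, Value> = columns
        .iter()
        .zip(values)
        .map(|(col, raw)| (col.name.clone(), pg_text_to_json(*raw, &col.type_name)))
        .collect();
    Value::Object(obj)
}

fn main() {
    let cols = vec![Column { name: "id".into(), type_name: "int4".into() }];
    println!("{}", row_to_json(&cols, &[Some("1")]));
}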
View File

@@ -33,7 +33,6 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }
pageserver = { path = "../pageserver" }
remote_storage = { path = "../libs/remote_storage" }
tracing.workspace = true
tracing-subscriber.workspace = true

View File

@@ -1,18 +1,13 @@
use std::collections::HashSet;
use anyhow::Context;
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use aws_sdk_s3::Client;
use tracing::{error, info, warn};
use utils::generation::Generation;
use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
use crate::{download_object_with_retries, RootTarget};
use futures_util::{pin_mut, StreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::IndexPart;
use remote_storage::RemotePath;
use utils::id::TenantTimelineId;
pub(crate) struct TimelineAnalysis {
@@ -73,7 +68,6 @@ pub(crate) async fn branch_cleanup_and_check_errors(
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
mut s3_layers,
} => {
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
@@ -113,62 +107,33 @@ pub(crate) async fn branch_cleanup_and_check_errors(
))
}
let layer_map_key = (layer, metadata.generation);
if !s3_layers.remove(&layer_map_key) {
// FIXME: this will emit false positives if an index was
// uploaded concurrently with our scan. To make this check
// correct, we need to try sending a HEAD request for the
// layer we think is missing.
if !s3_layers.remove(&layer) {
result.errors.push(format!(
"index_part.json contains a layer {}{} that is not present in remote storage",
layer_map_key.0.file_name(),
layer_map_key.1.get_suffix()
"index_part.json contains a layer {} that is not present in S3",
layer.file_name(),
))
}
}
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
.into_iter()
.filter(|(_layer_name, gen)|
// A layer is only considered orphaned if it has a generation below
// the index. If the generation is >= the index, then the layer may
// be an upload from a running pageserver, or even an upload from
// a new generation that didn't upload an index yet.
//
// Even so, a layer that is not referenced by the index could just
// be something enqueued for deletion, so while this check is valid
// for indicating that a layer is garbage, it is not an indicator
// of a problem.
gen < &index_part_generation)
.collect();
if !orphan_layers.is_empty() {
if !s3_layers.is_empty() {
result.errors.push(format!(
"index_part.json does not contain layers from S3: {:?}",
orphan_layers
s3_layers
.iter()
.map(|(layer_name, gen)| format!(
"{}{}",
layer_name.file_name(),
gen.get_suffix()
))
.map(|layer_name| layer_name.file_name())
.collect::<Vec<_>>(),
));
result.garbage_keys.extend(orphan_layers.iter().map(
|(layer_name, layer_gen)| {
result
.garbage_keys
.extend(s3_layers.iter().map(|layer_name| {
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
let delimiter = s3_root.delimiter();
if !key.ends_with(delimiter) {
key.push_str(delimiter);
}
key.push_str(&format!(
"{}{}",
&layer_name.file_name(),
layer_gen.get_suffix()
));
key.push_str(&layer_name.file_name());
key
},
));
}));
}
}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -213,96 +178,69 @@ pub(crate) struct S3TimelineBlobData {
pub(crate) enum BlobDataParseResult {
Parsed {
index_part: IndexPart,
index_part_generation: Generation,
s3_layers: HashSet<(LayerFileName, Generation)>,
s3_layers: HashSet<LayerFileName>,
},
Incorrect(Vec<String>),
}
fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
match name.rsplit_once('-') {
// FIXME: this is gross, just use a regex?
Some((layer_filename, gen)) if gen.len() == 8 => {
let layer = layer_filename.parse::<LayerFileName>()?;
let gen =
Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
Ok((layer, gen))
}
_ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
}
}
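The parse_layer_object_name helper shown above splits an optional trailing generation suffix off a layer object key: generation-aware keys look like "<layer file name>-<8-character suffix>", while legacy keys carry no suffix at all. A string-only sketch of the same split, treating the suffix as hexadecimal as Generation::get_suffix elsewhere in the diff suggests (the helper name and u32 representation are mine):

/// Split "name-XXXXXXXX" into (name, Some(generation)) when the tail after the
/// last '-' is exactly 8 hex characters; otherwise the key has no suffix.
fn split_generation_suffix(key: &str) -> (&str, Option<u32>) {
    if let Some((name, suffix)) = key.rsplit_once('-') {
        if suffix.len() == 8 {
            if let Ok(g) = u32::from_str_radix(suffix, 16) {
                return (name, Some(g));
            }
        }
    }
    (key, None)
}

fn main() {
    assert_eq!(split_generation_suffix("layerfile-00000001"), ("layerfile", Some(1)));
    assert_eq!(split_generation_suffix("layerfile"), ("layerfile", None));
}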
pub(crate) async fn list_timeline_blobs(
s3_client: &Client,
id: TenantTimelineId,
s3_root: &RootTarget,
) -> anyhow::Result<S3TimelineBlobData> {
let mut s3_layers = HashSet::new();
let mut index_part_object = None;
let timeline_dir_target = s3_root.timeline_root(&id);
let mut continuation_token = None;
let mut errors = Vec::new();
let mut keys_to_remove = Vec::new();
let mut timeline_dir_target = s3_root.timeline_root(&id);
timeline_dir_target.delimiter = String::new();
loop {
let fetch_response =
list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone())
.await?;
let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
let subdirectories = fetch_response.common_prefixes().unwrap_or_default();
if !subdirectories.is_empty() {
errors.push(format!(
"S3 list response should not contain any subdirectories, but got {subdirectories:?}"
));
}
let stream = stream_listing(s3_client, &timeline_dir_target);
pin_mut!(stream);
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = match obj.key() {
Some(k) => k,
None => continue,
};
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
match blob_name {
Some(name) if name.starts_with("index_part.json") => {
tracing::info!("Index key {key}");
index_parts.push(obj)
}
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
Ok((new_layer, gen)) => {
tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
s3_layers.insert((new_layer, gen));
}
Err(e) => {
tracing::info!("Error parsing key {maybe_layer_name}");
errors.push(
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
);
for (object, key) in fetch_response
.contents()
.unwrap_or_default()
.iter()
.filter_map(|object| Some((object, object.key()?)))
{
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
match blob_name {
Some("index_part.json") => index_part_object = Some(object.clone()),
Some(maybe_layer_name) => match maybe_layer_name.parse::<LayerFileName>() {
Ok(new_layer) => {
s3_layers.insert(new_layer);
}
Err(e) => {
errors.push(
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
);
keys_to_remove.push(key.to_string());
}
},
None => {
errors.push(format!("S3 list response got an object with odd key {key}"));
keys_to_remove.push(key.to_string());
}
},
None => {
tracing::info!("Peculiar key {}", key);
errors.push(format!("S3 list response got an object with odd key {key}"));
keys_to_remove.push(key.to_string());
}
}
}
// Choose the index_part with the highest generation
let (index_part_object, index_part_generation) = match index_parts
.iter()
.filter_map(|k| {
let key = k.key().unwrap();
// Stripping the index key to the last part, because RemotePath doesn't
// like absolute paths, and depending on prefix_in_bucket it's possible
// for the keys we read back to start with a slash.
let basename = key.rsplit_once('/').unwrap().1;
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
})
.max_by_key(|i| i.1)
.map(|(k, g)| (k.clone(), g))
{
Some((key, gen)) => (Some(key), gen),
None => {
// Legacy/missing case: one or zero index parts, which did not have a generation
(index_parts.pop(), Generation::none())
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
};
}
if index_part_object.is_none() {
errors.push("S3 list response got no index_part.json file".to_string());
@@ -323,7 +261,6 @@ pub(crate) async fn list_timeline_blobs(
return Ok(S3TimelineBlobData {
blob_data: BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers,
},
keys_to_remove,

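The generation-aware variant of list_timeline_blobs keeps only the index_part with the highest generation suffix and falls back to a single suffix-less legacy index when no suffix parses. A reduced sketch of that selection over plain key strings, reusing the hex-suffix convention from the sketch above (function and variable names are mine):

/// Pick the index_part key with the highest generation suffix; keys without a
/// parseable suffix are treated as the legacy, generation-less index.
fn pick_latest_index<'a>(keys: &[&'a str]) -> Option<(&'a str, Option<u32>)> {
    let latest = keys
        .iter()
        .filter_map(|k| {
            let (_, suffix) = k.rsplit_once('-')?;
            let g = u32::from_str_radix(suffix, 16).ok()?;
            Some((*k, g))
        })
        .max_by_key(|(_, g)| *g);
    match latest {
        Some((k, g)) => Some((k, Some(g))),
        // Legacy/missing case: zero or one index_part.json without a suffix.
        None => keys.first().map(|k| (*k, None)),
    }
}

fn main() {
    let keys = ["index_part.json-00000002", "index_part.json-0000000a"];
    assert_eq!(pick_latest_index(&keys), Some(("index_part.json-0000000a", Some(10))));
}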
View File

@@ -34,9 +34,6 @@ const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
#[derive(Debug, Clone)]
pub struct S3Target {
pub bucket_name: String,
/// This `prefix_in_bucket` is only equal to the PS/SK config of the same
/// name for the RootTarget: other instances of S3Target will have prefix_in_bucket
/// with extra parts.
pub prefix_in_bucket: String,
pub delimiter: String,
}
@@ -80,13 +77,9 @@ impl Display for NodeKind {
impl S3Target {
pub fn with_sub_segment(&self, new_segment: &str) -> Self {
let mut new_self = self.clone();
if new_self.prefix_in_bucket.is_empty() {
new_self.prefix_in_bucket = format!("/{}/", new_segment);
} else {
let _ = new_self.prefix_in_bucket.pop();
new_self.prefix_in_bucket =
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
}
let _ = new_self.prefix_in_bucket.pop();
new_self.prefix_in_bucket =
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
new_self
}
}
@@ -98,10 +91,10 @@ pub enum RootTarget {
}
impl RootTarget {
pub fn tenants_root(&self) -> S3Target {
pub fn tenants_root(&self) -> &S3Target {
match self {
Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
Self::Safekeeper(root) => root.with_sub_segment("wal"),
Self::Pageserver(root) => root,
Self::Safekeeper(root) => root,
}
}
@@ -140,7 +133,6 @@ impl RootTarget {
pub struct BucketConfig {
pub region: String,
pub bucket: String,
pub prefix_in_bucket: Option<String>,
/// Use SSO if this is set, else rely on AWS_* environment vars
pub sso_account_id: Option<String>,
@@ -163,12 +155,10 @@ impl BucketConfig {
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
let region = env::var("REGION").context("'REGION' param retrieval")?;
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
Ok(Self {
region,
bucket,
prefix_in_bucket,
sso_account_id,
})
}
@@ -201,14 +191,14 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
.with_target(false)
.with_ansi(false)
.with_writer(file_writer);
let stderr_logs = fmt::Layer::new()
.with_ansi(std::io::stderr().is_terminal())
let stdout_logs = fmt::Layer::new()
.with_ansi(std::io::stdout().is_terminal())
.with_target(false)
.with_writer(std::io::stderr);
.with_writer(std::io::stdout);
tracing_subscriber::registry()
.with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
.with(file_logs)
.with(stderr_logs)
.with(stdout_logs)
.init();
guard
@@ -260,20 +250,15 @@ fn init_remote(
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("pageserver/v1".to_string()),
prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(&delimiter),
delimiter,
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("safekeeper/v1".to_string()),
prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
delimiter,
}),
};

View File

@@ -31,10 +31,7 @@ enum Command {
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
},
ScanMetadata {
#[arg(short, long, default_value_t = false)]
json: bool,
},
ScanMetadata {},
}
#[tokio::main]
@@ -57,17 +54,13 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
Ok(summary) => {
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
println!("{}", summary.summary_string());
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else {

View File

@@ -13,10 +13,10 @@ pub fn stream_tenants<'a>(
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
try_stream! {
let mut continuation_token = None;
let tenants_target = target.tenants_root();
loop {
let tenants_target = target.tenants_root();
let fetch_response =
list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;
let new_entry_ids = fetch_response
.common_prefixes()

View File

@@ -10,10 +10,8 @@ use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::IndexPart;
use serde::Serialize;
use utils::id::TenantTimelineId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantTimelineId>,
@@ -27,9 +25,7 @@ pub struct MetadataSummary {
}
/// A histogram plus minimum and maximum tracking
#[derive(Serialize)]
struct MinMaxHisto {
#[serde(skip)]
histo: Histogram,
min: u64,
max: u64,
@@ -113,7 +109,6 @@ impl MetadataSummary {
self.count += 1;
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation: _,
s3_layers: _,
} = &data.blob_data
{

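One side of the MetadataSummary hunk derives Serialize so that scan-metadata can emit machine-readable JSON, and marks the embedded Histogram with #[serde(skip)] because that type has no Serialize impl, while min and max stay in the output. A compact sketch of the same pattern, with a stand-in for histogram::Histogram (serde and serde_json are assumed):

use serde::Serialize;

// Stand-in for histogram::Histogram, which does not implement Serialize.
struct Histogram;

#[derive(Serialize)]
struct MinMaxHisto {
    #[serde(skip)] // omit the non-serializable field from the JSON output
    histo: Histogram,
    min: u64,
    max: u64,
}

fn main() {
    let h = MinMaxHisto { histo: Histogram, min: 3, max: 42 };
    // Prints {"min":3,"max":42}; the histogram itself is skipped.
    println!("{}", serde_json::to_string(&h).unwrap());
}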
View File

@@ -47,7 +47,6 @@ pq_proto.workspace = true
remote_storage.workspace = true
safekeeper_api.workspace = true
storage_broker.workspace = true
tokio-stream.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -5,7 +5,6 @@ use std::fs::DirEntry;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::Result;
use camino::Utf8Path;
@@ -29,7 +28,7 @@ use crate::send_wal::WalSenderState;
use crate::GlobalTimelines;
/// Various filters that influence the resulting JSON output.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Args {
/// Dump all available safekeeper state. False by default.
pub dump_all: bool,
@@ -54,76 +53,15 @@ pub struct Args {
}
/// Response for debug dump request.
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct Response {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,
pub timelines: Vec<TimelineDumpSer>,
pub timelines: Vec<Timeline>,
pub timelines_count: usize,
pub config: Config,
}
pub struct TimelineDumpSer {
pub tli: Arc<crate::timeline::Timeline>,
pub args: Args,
pub runtime: Arc<tokio::runtime::Runtime>,
}
impl std::fmt::Debug for TimelineDumpSer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TimelineDumpSer")
.field("tli", &self.tli.ttid)
.field("args", &self.args)
.finish()
}
}
impl Serialize for TimelineDumpSer {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let dump = self
.runtime
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
dump.serialize(serializer)
}
}
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
let control_file = if args.dump_control_file {
let mut state = timeline.get_state().await.1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
Some(state)
} else {
None
};
let memory = if args.dump_memory {
Some(timeline.memory_dump().await)
} else {
None
};
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
build_disk_content(&timeline.timeline_dir).ok()
} else {
None
};
Timeline {
tenant_id: timeline.ttid.tenant_id,
timeline_id: timeline.ttid.timeline_id,
control_file,
memory,
disk_content,
}
}
/// Safekeeper configuration.
#[derive(Debug, Serialize, Deserialize)]
pub struct Config {
@@ -202,12 +140,8 @@ pub async fn build(args: Args) -> Result<Response> {
GlobalTimelines::get_all()
};
// TODO: return Stream instead of Vec
let mut timelines = Vec::new();
let runtime = Arc::new(
tokio::runtime::Builder::new_current_thread()
.build()
.unwrap(),
);
for tli in ptrs_snapshot {
let ttid = tli.ttid;
if let Some(tenant_id) = args.tenant_id {
@@ -221,11 +155,38 @@ pub async fn build(args: Args) -> Result<Response> {
}
}
timelines.push(TimelineDumpSer {
tli,
args: args.clone(),
runtime: runtime.clone(),
});
let control_file = if args.dump_control_file {
let mut state = tli.get_state().await.1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
Some(state)
} else {
None
};
let memory = if args.dump_memory {
Some(tli.memory_dump().await)
} else {
None
};
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
build_disk_content(&tli.timeline_dir).ok()
} else {
None
};
let timeline = Timeline {
tenant_id: ttid.tenant_id,
timeline_id: ttid.timeline_id,
control_file,
memory,
disk_content,
};
timelines.push(timeline);
}
let config = GlobalTimelines::get_global_config();

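The TimelineDumpSer shown above bridges serde's synchronous Serialize trait and the async per-timeline dump: each serialize call drives the dump future to completion on a dedicated current-thread runtime, and the HTTP handler later performs the whole serialization inside spawn_blocking so the main runtime is never blocked. A stripped-down sketch of that bridge, with a placeholder Dump type instead of the real timeline state (tokio and serde are assumed):

use std::sync::Arc;
use serde::{Serialize, Serializer};

#[derive(Serialize)]
struct Dump { timeline_id: String, memory: Option<String> }

// Stand-in for the async per-timeline state collection.
async fn build_dump(id: &str) -> Dump {
    Dump { timeline_id: id.to_string(), memory: None }
}

struct TimelineDumpSer {
    timeline_id: String,
    // Shared current-thread runtime used only to drive the dump future.
    runtime: Arc<tokio::runtime::Runtime>,
}

impl Serialize for TimelineDumpSer {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        // block_on is acceptable here because the surrounding handler runs the
        // whole serialization inside tokio::task::spawn_blocking.
        let dump = self.runtime.block_on(build_dump(&self.timeline_id));
        dump.serialize(serializer)
    }
}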
View File

@@ -13,12 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::info_span;
use utils::http::endpoint::{request_span, ChannelWriter};
use utils::http::endpoint::request_span;
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::Term;
@@ -378,52 +373,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
.await
.map_err(ApiError::InternalServerError)?;
let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1);
let body = Body::wrap_stream(ReceiverStream::new(rx));
let mut writer = ChannelWriter::new(128 * 1024, tx);
let response = Response::builder()
.status(200)
.header(hyper::header::CONTENT_TYPE, "application/octet-stream")
.body(body)
.unwrap();
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
let _span = span.entered();
let res = serde_json::to_writer(&mut writer, &resp)
.map_err(std::io::Error::from)
.and_then(|_| writer.flush());
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /v1/debug_dump"
);
}
Err(e) => {
tracing::warn!("failed to write out /v1/debug_dump response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//
// though, most likely the reason for failure is that the receiver is already gone.
drop(
writer
.tx
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
);
}
}
});
Ok(response)
// TODO: use streaming response
json_response(StatusCode::OK, resp)
}
/// Safekeeper http router.

View File

@@ -1,4 +1,3 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use anyhow::{bail, Context, Result};
@@ -33,16 +32,6 @@ pub struct Response {
// TODO: add more fields?
}
/// Response for debug dump request.
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugDumpResponse {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,
pub timelines: Vec<debug_dump::Timeline>,
pub timelines_count: usize,
pub config: debug_dump::Config,
}
/// Find the most advanced safekeeper and pull timeline from it.
pub async fn handle_request(request: Request) -> Result<Response> {
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
@@ -114,7 +103,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
// Implementing our own scp over HTTP.
// At first, we need to fetch list of files from safekeeper.
let dump: DebugDumpResponse = client
let dump: debug_dump::Response = client
.get(format!(
"{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
host, status.tenant_id, status.timeline_id

View File

@@ -81,6 +81,7 @@ FALLBACK_DURATION = {
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
"test_runner/performance/test_startup.py::test_startup": 890.114,
"test_runner/performance/test_startup.py::test_startup_simple": 2.51,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,

View File

@@ -2868,7 +2868,7 @@ class SafekeeperHttpClient(requests.Session):
params = params or {}
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
res.raise_for_status()
res_json = json.loads(res.text)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
@@ -2968,33 +2968,24 @@ class S3Scrubber:
self.env = env
self.log_dir = log_dir
def scrubber_cli(self, args: list[str], timeout) -> str:
def scrubber_cli(self, args, timeout):
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
s3_storage = self.env.pageserver_remote_storage
env = {
"REGION": s3_storage.bucket_region,
"BUCKET": s3_storage.bucket_name,
"BUCKET_PREFIX": s3_storage.prefix_in_bucket,
"RUST_LOG": "DEBUG",
}
env.update(s3_storage.access_env_vars())
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "s3_scrubber")]
base_args = [self.env.neon_binpath / "s3_scrubber"]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
env=env,
check=False,
capture_stdout=True,
timeout=timeout,
(output_path, _, status_code) = subprocess_capture(
self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
)
if status_code:
log.warning(f"Scrub command {args} failed")
@@ -3003,18 +2994,8 @@ class S3Scrubber:
raise RuntimeError("Remote storage scrub failed")
assert stdout is not None
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
try:
return json.loads(stdout)
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
raise
def scan_metadata(self):
self.scrubber_cli(["scan-metadata"], timeout=30)
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:

View File

@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
# this has been seen in the wild by tests with the below contradicting logging
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
# this seems like a mock_s3 issue
log.warning(
log.warn(
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
)
keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
log.warning(
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)

View File

@@ -35,7 +35,6 @@ def subprocess_capture(
echo_stderr=False,
echo_stdout=False,
capture_stdout=False,
timeout=None,
**kwargs: Any,
) -> Tuple[str, Optional[str], int]:
"""Run a process and bifurcate its output to files and the `log` logger
@@ -105,7 +104,7 @@ def subprocess_capture(
stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
stderr_handler.start()
r = p.wait(timeout=timeout)
r = p.wait()
stdout_handler.join()
stderr_handler.join()

View File

@@ -1,10 +1,8 @@
from contextlib import closing
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pageserver.utils import wait_tenant_status_404
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn
#
@@ -20,8 +18,6 @@ from fixtures.types import Lsn
def test_bulk_insert(neon_with_baseline: PgCompare):
env = neon_with_baseline
start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table huge (i int, j int);")
@@ -35,13 +31,6 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
env.report_peak_memory_use()
env.report_size()
# Report amount of wal written. Useful for comparing vanilla wal format vs
# neon wal format, measuring neon write amplification, etc.
end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
wal_written_bytes = end_lsn - start_lsn
wal_written_mb = round(wal_written_bytes / (1024 * 1024))
env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
# When testing neon, also check how long it takes the pageserver to reingest the
# wal from safekeepers. If this number is close to total runtime, then the pageserver
# is the bottleneck.

View File

@@ -1,3 +1,6 @@
from contextlib import closing
import pytest
import requests
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnvBuilder
@@ -78,3 +81,49 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
# Imitate optimizations that console would do for the second start
endpoint.respec(skip_pg_catalog_updates=True)
# This test sometimes runs for longer than the global 5 minute timeout.
@pytest.mark.timeout(900)
def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
# Start
env.neon_cli.create_branch("test_startup")
with zenbenchmark.record_duration("startup_time"):
endpoint = env.endpoints.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Restart
endpoint.stop_and_destroy()
with zenbenchmark.record_duration("restart_time"):
endpoint.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Fill up
num_rows = 1000000 # 30 MB
num_tables = 100
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
for i in range(num_tables):
cur.execute(f"create table t_{i} (i integer);")
cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
# Read
with zenbenchmark.record_duration("read_time"):
endpoint.safe_psql("select * from t_0;")
# Read again
with zenbenchmark.record_duration("second_read_time"):
endpoint.safe_psql("select * from t_0;")
# Restart
endpoint.stop_and_destroy()
with zenbenchmark.record_duration("restart_with_data"):
endpoint.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Read
with zenbenchmark.record_duration("read_after_restart"):
endpoint.safe_psql("select * from t_0;")

View File

@@ -150,9 +150,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"compaction_target_size": 1048576,
"checkpoint_distance": 10000,
"checkpoint_timeout": "13m",
"compaction_algorithm": {
"kind": "Tiered",
},
"eviction_policy": {
"kind": "LayerAccessThreshold",
"period": "20s",

View File

@@ -72,7 +72,7 @@ class DdlForwardingContext:
self.dbs: Dict[str, str] = {}
self.roles: Dict[str, str] = {}
self.fail = False
endpoint = "/test/roles_and_databases"
endpoint = "/management/api/v2/roles_and_databases"
ddl_url = f"http://{host}:{port}{endpoint}"
self.pg.configure(
[

View File

@@ -1,14 +1,11 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
logical_replication_sync,
wait_for_last_flush_lsn,
)
from fixtures.types import Lsn
from fixtures.utils import query_scalar
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
@@ -150,89 +147,3 @@ COMMIT;
endpoint.start()
# it must be gone (but walproposer slot still exists, hence 1)
assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
# Test compute start at LSN page of which starts with contrecord
# https://github.com/neondatabase/neon/issues/5749
def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
env.neon_cli.create_branch("init")
endpoint = env.endpoints.create_start("init")
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
cur = endpoint.connect().cursor()
cur.execute("create table t(key int, value text)")
cur.execute("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
cur.execute("insert into replication_example values (1, 2)")
cur.execute("create publication pub1 for table replication_example")
# now start subscriber
vanilla_pg.start()
vanilla_pg.safe_psql("create table t(pk integer primary key, value text)")
vanilla_pg.safe_psql("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
connstr = endpoint.connstr().replace("'", "''")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
logical_replication_sync(vanilla_pg, endpoint)
vanilla_pg.stop()
with endpoint.cursor() as cur:
# measure how much space a logical message takes. Sometimes the first attempt
# creates a huge message and then it stabilizes; no idea why.
for _ in range(3):
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"current_lsn={lsn_before}")
# Non-transactional logical message doesn't write WAL, only XLogInsert's
# it, so use transactional. Which is a bit problematic as transactional
# necessitates commit record. Alternatively we can do smth like
# select neon_xlogflush(pg_current_wal_insert_lsn());
# but isn't much better + that particular call complains on 'xlog flush
# request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
# page headers.
payload = "blahblah"
cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
log.info(
f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
)
# and write logical message spanning exactly as we want
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"current_lsn={lsn_before}")
curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
offs = int(curr_lsn) % 8192
till_page = 8192 - offs
payload_len = (
till_page - logical_message_base - 8
) # not sure why 8 is here, it is deduced from experiments
log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
# payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
payload_len += 8
cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
# The calculations to hit the page boundary are very fuzzy, so just
# ignore test if we fail to reach it.
if not (int(supposedly_contrecord_end) % 8192 == 32):
pytest.skip("missed page boundary, bad luck")
cur.execute("insert into replication_example values (2, 3)")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop().start()
cur = endpoint.connect().cursor()
# this should flush current wal page
cur.execute("insert into replication_example values (3, 4)")
vanilla_pg.start()
logical_replication_sync(vanilla_pg, endpoint)
assert vanilla_pg.safe_psql(
"select sum(somedata) from replication_example"
) == endpoint.safe_psql("select sum(somedata) from replication_example")

View File

@@ -21,7 +21,6 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
S3Scrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -235,22 +234,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0
# Flush through deletions to get a clean state for scrub: we are implicitly validating
# that our generations-enabled pageserver was able to do deletions of layers
# from earlier which don't have a generation.
env.pageserver.http_client().deletion_queue_flush(execute=True)
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = S3Scrubber(
neon_env_builder.test_output_dir, neon_env_builder
).scan_metadata()
assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True

View File

@@ -432,47 +432,3 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
query(200, "BEGIN")
pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
assert pid1 != pid2
@pytest.mark.timeout(60)
def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
def query(status: int, query: str) -> Any:
return static_proxy.http_query(
query,
[],
user="http_auth",
password="http",
expected_code=status,
)
# query generates 25 million rows - should hit the 10MB response limit quickly
response = query(
400,
"select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
)
assert "response is too large (max is 10485760 bytes)" in response["message"]
def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
def query(status: int, query: str) -> Any:
return static_proxy.http_query(
query,
[],
user="http_auth",
password="http",
expected_code=status,
)
response = query(
200,
"select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
)
assert response["rows"][0]["data"] == ["foo", "bar", "baz"]

View File

@@ -1,5 +1,5 @@
{
"postgres-v16": "763000f1d0873b827829c41f2f6f799ffc0de55c",
"postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
"postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a"
"postgres-v16": "550ffa6495a5dc62fccc3a8b449386633758680b",
"postgres-v15": "ab67ab96355d61e9d0218630be4aa7db53bf83e7",
"postgres-v14": "6669a672ee14ab2c09d44c4552f9a13fad3afc10"
}