mirror of https://github.com/neondatabase/neon.git
synced 2026-01-25 14:20:38 +00:00

Compare commits: initdb_wal ... jcsp/disk-

2 commits:

| Author | SHA1 | Date |
|---|---|---|
| | ed3e3b6f61 | |
| | 098ef0956b | |
.github/actionlint.yml (vendored): 2 changes
@@ -5,6 +5,4 @@ self-hosted-runner:
- small
- us-east-2
config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID

@@ -203,10 +203,6 @@ runs:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: |
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
exit 0
fi

export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

./scripts/pysync
.github/workflows/build_and_test.yml (vendored): 12 changes

@@ -340,11 +340,11 @@ jobs:
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
export REMOTE_STORAGE_AZURE_REGION=eastus2
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

@@ -433,7 +433,7 @@ jobs:
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

- name: Merge and upload coverage data

@@ -468,7 +468,7 @@ jobs:
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

@@ -847,7 +847,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.18.5
VM_BUILDER_VERSION: v0.18.2

steps:
- name: Checkout
.github/workflows/release.yml (vendored): 2 changes

@@ -2,7 +2,7 @@ name: Create Release Branch

on:
schedule:
- cron: '0 7 * * 5'
- cron: '0 7 * * 2'
workflow_dispatch:

jobs:
Cargo.lock (generated): 23 changes

@@ -1609,6 +1609,16 @@ dependencies = [
"subtle",
]

[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
"quote",
"syn 1.0.109",
]

[[package]]
name = "ctr"
version = "0.6.0"

@@ -2704,10 +2714,11 @@ dependencies = [

[[package]]
name = "log"
version = "0.4.20"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
"cfg-if",
"value-bag",
]

@@ -6000,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

[[package]]
name = "value-bag"
version = "1.4.2"
version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe"
checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
"ctor",
"version_check",
]

[[package]]
name = "vcpkg"
@@ -156,7 +156,6 @@ fn main() -> Result<()> {
let path = Path::new(sp);
let file = File::open(path)?;
spec = Some(serde_json::from_reader(file)?);
live_config_allowed = true;
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
live_config_allowed = true;

@@ -278,26 +277,32 @@ fn main() -> Result<()> {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
let vm_monitor_addr = matches
.get_one::<String>("vm-monitor-addr")
.expect("--vm-monitor-addr should always be set because it has a default arg");
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = if env::var_os("AUTOSCALING").is_some() {
Some(
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor")
)
} else {
None
.expect("failed to create tokio runtime for monitor"),
),
};

// This token is used internally by the monitor to clean up all threads

@@ -308,7 +313,8 @@ fn main() -> Result<()> {
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.clone(),
addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})),
token.clone(),
))

@@ -480,8 +486,6 @@ fn cli() -> clap::Command {
.value_name("FILECACHE_CONNSTR"),
)
.arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
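The hunk above pairs the `AUTOSCALING` environment variable with the now-optional `--vm-monitor-addr` flag. A condensed, illustrative sketch of that decision table (names taken from the hunk; this is not the exact compute_ctl code):

```rust
use tokio::runtime::{Builder, Runtime};

// Sketch only: mirrors the (AUTOSCALING, --vm-monitor-addr) match from the hunk above.
fn monitor_runtime(autoscaling_env_set: bool, vm_monitor_addr: Option<&str>) -> Option<Runtime> {
    match (autoscaling_env_set, vm_monitor_addr) {
        (false, None) => None,
        (false, Some(_)) => {
            // Flag given but AUTOSCALING not set: warn and run without the monitor.
            eprintln!("--vm-monitor-addr option set but AUTOSCALING env var not present");
            None
        }
        (true, None) => panic!("AUTOSCALING env var present but --vm-monitor-addr option not set"),
        (true, Some(_)) => Some(
            Builder::new_multi_thread()
                .worker_threads(4)
                .enable_all()
                .build()
                .expect("failed to create tokio runtime for monitor"),
        ),
    }
}
```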
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact
.query(
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
.iter()
.map(|row| Role {
name: row.get("rolname"),
encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None,
})
.collect();

@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", format!("Bearer {}", jwt))
.header("Authorization", jwt)
.send()
.map_err(|e| {
(

@@ -265,8 +265,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {

@@ -298,8 +296,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action {
RoleAction::None => {}
RoleAction::Update => {
let mut query: String =
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
@@ -19,7 +19,7 @@ const COMMAND: &str = "attachment_service";
pub struct AttachHookRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
pub node_id: Option<NodeId>,
pub pageserver_id: Option<NodeId>,
}

#[derive(Serialize, Deserialize)]

@@ -85,7 +85,7 @@ impl AttachmentService {
.control_plane_api
.clone()
.unwrap()
.join("attach-hook")
.join("attach_hook")
.unwrap();
let client = reqwest::blocking::ClientBuilder::new()
.build()

@@ -93,7 +93,7 @@ impl AttachmentService {

let request = AttachHookRequest {
tenant_id,
node_id: Some(pageserver_id),
pageserver_id: Some(pageserver_id),
};

let response = client.post(url).json(&request).send()?;

@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

@@ -172,7 +171,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
id: *t,
gen: state.generation,
generation: state.generation,
});
}
}

@@ -218,31 +217,14 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
.tenants
.entry(attach_req.tenant_id)
.or_insert_with(|| TenantState {
pageserver: attach_req.node_id,
pageserver: attach_req.pageserver_id,
generation: 0,
});

if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
if attach_req.pageserver_id.is_some() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
tenant_state.pageserver = attach_req.pageserver_id;
let generation = tenant_state.generation;

locked.save().await.map_err(ApiError::InternalServerError)?;

@@ -250,7 +232,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
json_response(
StatusCode::OK,
AttachHookResponse {
gen: attach_req.node_id.map(|_| generation),
gen: attach_req.pageserver_id.map(|_| generation),
},
)
}

@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/re-attach", handle_re_attach)
.post("/validate", handle_validate)
.post("/attach_hook", handle_attach_hook)
}

#[tokio::main]
@@ -798,24 +798,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
}
"reconfigure" => {
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageserver_id =
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
Some(NodeId(
id_str.parse().context("while parsing pageserver id")?,
))
} else {
None
};
endpoint.reconfigure(pageserver_id)?;
}
"stop" => {
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")

@@ -1387,12 +1369,6 @@ fn cli() -> Command {
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
.arg(endpoint_pageserver_id_arg)
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
)
.subcommand(
Command::new("stop")
.arg(endpoint_id_arg)
@@ -414,32 +414,16 @@ impl Endpoint {
);
}

Ok(())
}

fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
// TODO use background_process::stop_process instead
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
Ok(())
}

fn read_postgresql_conf(&self) -> Result<String> {
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
match std::fs::read(&postgresql_conf_path) {
Ok(content) => Ok(String::from_utf8(content)?),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
Err(e) => Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
))),
}
Ok(())
}

pub fn start(

@@ -452,7 +436,21 @@ impl Endpoint {
anyhow::bail!("The endpoint is already running");
}

let postgresql_conf = self.read_postgresql_conf()?;
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
Ok(content) => String::from_utf8(content)?,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};

// We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first.

@@ -623,61 +621,6 @@ impl Endpoint {
}
}

pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
let mut spec: ComputeSpec = {
let spec_path = self.endpoint_path().join("spec.json");
let file = std::fs::File::open(spec_path)?;
serde_json::from_reader(file)?
};

let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);

if let Some(pageserver_id) = pageserver_id {
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
let mut endpoint_conf: EndpointConf = {
let file = std::fs::File::open(&endpoint_config_path)?;
serde_json::from_reader(file)?
};
endpoint_conf.pageserver_id = pageserver_id;
std::fs::write(
endpoint_config_path,
serde_json::to_string_pretty(&endpoint_conf)?,
)?;

let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ps_http_conf = &pageserver.pg_connection_config;
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
}

let client = reqwest::blocking::Client::new();
let response = client
.post(format!(
"http://{}:{}/configure",
self.http_address.ip(),
self.http_address.port()
))
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?
))
.send()?;

let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(())
} else {
let url = response.url().to_owned();
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
}
}

pub fn stop(&self, destroy: bool) -> Result<()> {
// If we are going to destroy data directory,
// use immediate shutdown mode, otherwise,

@@ -686,25 +629,15 @@ impl Endpoint {
// Postgres is always started from scratch, so stop
// without destroy only used for testing and debugging.
//
self.pg_ctl(
if destroy {
&["-m", "immediate", "stop"]
} else {
&["stop"]
},
&None,
)?;

// Also wait for the compute_ctl process to die. It might have some cleanup
// work to do after postgres stops, like syncing safekeepers, etc.
//
self.wait_for_compute_ctl_to_exit()?;
if destroy {
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
std::fs::remove_dir_all(self.endpoint_path())?;
} else {
self.pg_ctl(&["stop"], &None)?;
}
Ok(())
}
@@ -1,108 +0,0 @@
# Updating Postgres

## Minor Versions

When upgrading to a new minor version of Postgres, please follow these steps:

_Example: 15.4 is the new minor version to upgrade to from 15.3._

1. Clone the Neon Postgres repository if you have not done so already.

   ```shell
   git clone git@github.com:neondatabase/postgres.git
   ```

1. Add the Postgres upstream remote.

   ```shell
   git remote add upstream https://git.postgresql.org/git/postgresql.git
   ```

1. Create a new branch based on the stable branch you are updating.

   ```shell
   git checkout -b my-branch REL_15_STABLE_neon
   ```

1. Tag the last commit on the stable branch you are updating.

   ```shell
   git tag REL_15_3_neon
   ```

1. Push the new tag to the Neon Postgres repository.

   ```shell
   git push origin REL_15_3_neon
   ```

1. Find the release tags you're looking for. They are of the form `REL_X_Y`.

1. Rebase the branch you created on the tag and resolve any conflicts.

   ```shell
   git fetch upstream REL_15_4
   git rebase REL_15_4
   ```

1. Run the Postgres test suite to make sure our commits have not affected Postgres in a negative way.

   ```shell
   make check
   # OR
   meson test -C builddir
   ```

1. Push your branch to the Neon Postgres repository.

   ```shell
   git push origin my-branch
   ```

1. Clone the Neon repository if you have not done so already.

   ```shell
   git clone git@github.com:neondatabase/neon.git
   ```

1. Create a new branch.

1. Change the `revisions.json` file to point at the HEAD of your Postgres branch.

1. Update the Git submodule.

   ```shell
   git submodule set-branch --branch my-branch vendor/postgres-v15
   git submodule update --remote vendor/postgres-v15
   ```

1. Run the Neon test suite to make sure that Neon is still good to go on this minor Postgres release.

   ```shell
   ./scripts/poetry -k pg15
   ```

1. Commit your changes.

1. Create a pull request, and wait for CI to go green.

1. Force push the rebased Postgres branches into the Neon Postgres repository.

   ```shell
   git push --force origin my-branch:REL_15_STABLE_neon
   ```

   It may require disabling various branch protections.

1. Update your Neon PR to point at the branches.

   ```shell
   git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
   git commit --amend --no-edit
   git push --force origin
   ```

1. Merge the pull request after getting approval(s) and CI completion.
@@ -190,8 +190,6 @@ pub struct DeltaOp {
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions,
}

@@ -89,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];

pub fn set_build_info_metric(revision: &str, build_tag: &str) {
pub fn set_build_info_metric(revision: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
"Build/version information",
&["revision", "build_tag"]
&["revision"]
)
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
metric.with_label_values(&[revision]).set(1);
}

// Records I/O stats in a "cross-platform" way.

@@ -17,7 +17,7 @@ pub struct ReAttachRequest {
pub struct ReAttachResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub gen: u32,
pub generation: u32,
}

#[derive(Serialize, Deserialize)]

@@ -110,6 +110,7 @@ impl TenantState {
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
// tenant mgr startup distinguishes attaching from loading via marker file.
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
// We only reach Active after successful load / attach.
// So, call atttachment status Attached.

@@ -242,7 +242,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
}
}

/// Cancellation safe as long as the underlying IO is cancellation safe.
async fn shutdown(&mut self) -> io::Result<()> {
match self {
MaybeWriteOnly::Full(framed) => framed.shutdown().await,

@@ -394,23 +393,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
shutdown_watcher: F,
) -> Result<(), QueryError>
where
F: Fn() -> S + Clone,
F: Fn() -> S,
S: Future,
{
let ret = self
.run_message_loop(handler, shutdown_watcher.clone())
.await;

tokio::select! {
_ = shutdown_watcher() => {
// do nothing; we most likely got already stopped by shutdown and will log it next.
}
_ = self.framed.shutdown() => {
// socket might be already closed, e.g. if previously received error,
// so ignore result.
},
}

let ret = self.run_message_loop(handler, shutdown_watcher).await;
// socket might be already closed, e.g. if previously received error,
// so ignore result.
self.framed.shutdown().await.ok();
match ret {
Ok(()) => Ok(()),
Err(QueryError::Shutdown) => {

@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
($version:ident) => {
#[path = "."]
mod $version {
#[allow(unused_imports)]
pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)]
#[cfg(test)]
@@ -214,24 +214,27 @@ where
}
}

/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn flush<S: AsyncWrite + Unpin>(
stream: &mut S,
write_buf: &mut BytesMut,
) -> Result<(), io::Error> {
while write_buf.has_remaining() {
let bytes_written = stream.write_buf(write_buf).await?;
let bytes_written = stream.write(write_buf.chunk()).await?;
if bytes_written == 0 {
return Err(io::Error::new(
ErrorKind::WriteZero,
"failed to write message",
));
}
// The advanced part will be garbage collected, likely during shifting
// data left on next attempt to write to buffer when free space is not
// enough.
write_buf.advance(bytes_written);
}
write_buf.clear();
stream.flush().await
}

/// Cancellation safe as long as the AsyncWrite is cancellation safe.
async fn shutdown<S: AsyncWrite + Unpin>(
stream: &mut S,
write_buf: &mut BytesMut,
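The rewritten flush loop above moves between `write_buf` (which advances the buffer cursor itself) and a plain `write` of the current chunk followed by an explicit `advance`. A self-contained sketch of the manual-advance pattern, assuming only `bytes` and `tokio`:

```rust
use bytes::{Buf, BytesMut};
use std::io::{Error, ErrorKind};
use tokio::io::{AsyncWrite, AsyncWriteExt};

// Sketch only: write the unread chunk, then move the cursor ourselves.
// A zero-length write is treated as an error, mirroring the hunk above.
async fn flush_all<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
) -> Result<(), Error> {
    while write_buf.has_remaining() {
        let bytes_written = stream.write(write_buf.chunk()).await?;
        if bytes_written == 0 {
            return Err(Error::new(ErrorKind::WriteZero, "failed to write message"));
        }
        write_buf.advance(bytes_written);
    }
    write_buf.clear();
    stream.flush().await
}
```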
@@ -23,8 +23,8 @@ use tracing::debug;

use crate::s3_bucket::RequestKind;
use crate::{
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
RemoteStorage, StorageMetadata,
AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
StorageMetadata,
};

pub struct AzureBlobStorage {

@@ -184,11 +184,10 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {

#[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage {
async fn list(
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError> {
) -> Result<Vec<RemotePath>, DownloadError> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_name(p))

@@ -196,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});

let mut builder = self.client.list_blobs();

if let ListingMode::WithDelimiter = mode {
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let mut builder = self
.client
.list_blobs()
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());

if let Some(prefix) = list_prefix {
builder = builder.prefix(Cow::from(prefix.to_owned()));

@@ -219,23 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
}

let mut response = builder.into_stream();
let mut res = Listing::default();
while let Some(l) = response.next().await {
let entry = l.map_err(to_download_error)?;
let prefix_iter = entry
let mut res = Vec::new();
while let Some(entry) = response.next().await {
let entry = entry.map_err(to_download_error)?;
let name_iter = entry
.blobs
.prefixes()
.map(|prefix| self.name_to_relative_path(&prefix.name));
res.prefixes.extend(prefix_iter);

let blob_iter = entry
.blobs
.blobs()
.map(|k| self.name_to_relative_path(&k.name));
res.keys.extend(blob_iter);
res.extend(name_iter);
}
Ok(res)
}

async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let folder_name = folder
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone());

let mut builder = self.client.list_blobs();

if let Some(folder_name) = folder_name {
builder = builder.prefix(Cow::from(folder_name.to_owned()));
}

if let Some(limit) = self.max_keys_per_list_response {
builder = builder.max_results(MaxResults::new(limit));
}

let mut response = builder.into_stream();
let mut res = Vec::new();
while let Some(l) = response.next().await {
let entry = l.map_err(anyhow::Error::new)?;
let name_iter = entry
.blobs
.blobs()
.map(|bl| self.name_to_relative_path(&bl.name));
res.extend(name_iter);
}
Ok(res)
}

async fn upload(
&self,
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
@@ -129,22 +129,6 @@ impl RemotePath {
}
}

/// We don't need callers to be able to pass arbitrary delimiters: just control
/// whether listings will use a '/' separator or not.
///
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
/// NoDelimiter mode will only populate `keys`.
pub enum ListingMode {
WithDelimiter,
NoDelimiter,
}

#[derive(Default)]
pub struct Listing {
pub prefixes: Vec<RemotePath>,
pub keys: Vec<RemotePath>,
}

/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.

@@ -157,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::WithDelimiter)
.await?
.prefixes;
Ok(result)
}
) -> Result<Vec<RemotePath>, DownloadError>;

/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes,

@@ -175,16 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
Ok(result)
}

async fn list(
&self,
prefix: Option<&RemotePath>,
_mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError>;
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;

/// Streams the local file contents into remote into the remote storage entry.
async fn upload(

@@ -235,9 +205,6 @@ pub enum DownloadError {
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}

@@ -248,7 +215,6 @@ impl std::fmt::Display for DownloadError {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
}

@@ -268,19 +234,6 @@ pub enum GenericRemoteStorage {
}

impl GenericRemoteStorage {
pub async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> anyhow::Result<Listing, DownloadError> {
match self {
Self::LocalFs(s) => s.list(prefix, mode).await,
Self::AwsS3(s) => s.list(prefix, mode).await,
Self::AzureBlob(s) => s.list(prefix, mode).await,
Self::Unreliable(s) => s.list(prefix, mode).await,
}
}

// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
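For orientation, a hedged sketch of how a caller might use the two listing shapes that appear in the hunks above; it assumes the `remote_storage` types shown there (`GenericRemoteStorage`, `RemotePath`, `ListingMode`) are in scope and is not taken from the repository itself:

```rust
// Illustrative only: WithDelimiter walks one level and fills both `prefixes` and
// `keys`; NoDelimiter returns every key under the prefix in `keys`.
async fn print_layout(
    storage: &GenericRemoteStorage,
    prefix: Option<&RemotePath>,
) -> anyhow::Result<()> {
    let shallow = storage
        .list(prefix, ListingMode::WithDelimiter)
        .await
        .map_err(|e| anyhow::anyhow!("{e}"))?;
    for p in &shallow.prefixes {
        println!("prefix: {p:?}");
    }
    let deep = storage
        .list(prefix, ListingMode::NoDelimiter)
        .await
        .map_err(|e| anyhow::anyhow!("{e}"))?;
    println!("{} keys in total", deep.keys.len());
    Ok(())
}
```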
@@ -15,7 +15,7 @@ use tokio::{
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
use crate::{Download, DownloadError, RemotePath};

use super::{RemoteStorage, StorageMetadata};

@@ -75,7 +75,7 @@ impl LocalFs {
}

#[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
Ok(get_all_files(&self.storage_root, true)
.await?
.into_iter()

@@ -89,10 +89,52 @@ impl LocalFs {
})
.collect())
}
}

#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};

let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;

let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());

// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}

prefixes.push(
prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
),
)
}

Ok(prefixes)
}

// recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket`
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(),

@@ -144,70 +186,6 @@ impl LocalFs {

Ok(files)
}
}

#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
let mut result = Listing::default();

if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;

result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();

return Ok(result);
}

let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};

let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;

// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}

let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);

if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
}

Ok(result)
}

async fn upload(
&self,

@@ -501,7 +479,7 @@ mod fs_tests {

let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
assert_eq!(
storage.list_all().await?,
storage.list().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);

@@ -689,7 +667,7 @@ mod fs_tests {
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

storage.delete(&upload_target).await?;
assert!(storage.list_all().await?.is_empty());
assert!(storage.list().await?.is_empty());

storage
.delete(&upload_target)

@@ -747,43 +725,6 @@ mod fs_tests {
Ok(())
}

#[tokio::test]
async fn list() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let storage = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;

let listing = storage.list(None, ListingMode::NoDelimiter).await?;
assert!(listing.prefixes.is_empty());
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

// Delimiter: should only go one deep
let listing = storage.list(None, ListingMode::WithDelimiter).await?;

assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines").unwrap()].to_vec()
);
assert!(listing.keys.is_empty());

// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
ListingMode::WithDelimiter,
)
.await?;
assert_eq!(
listing.prefixes,
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());

Ok(())
}

async fn upload_dummy_file(
storage: &LocalFs,
name: &str,

@@ -836,7 +777,7 @@ mod fs_tests {
}

async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
let mut files = storage.list_all().await?;
let mut files = storage.list().await?;
files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files)
}
@@ -30,8 +30,8 @@ use tracing::debug;

use super::StorageMetadata;
use crate::{
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};

pub(super) mod metrics;

@@ -299,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {

#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
) -> Result<Vec<RemotePath>, DownloadError> {
let kind = RequestKind::List;
let mut result = Listing::default();

// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix

@@ -314,33 +314,28 @@ impl RemoteStorage for S3Bucket {
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});

let mut document_keys = Vec::new();

let mut continuation_token = None;

loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);

let mut request = self
let fetch_response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response);

if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}

let response = request
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list S3 prefixes")

@@ -350,35 +345,71 @@ impl RemoteStorage for S3Bucket {

metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
.observe_elapsed(kind, &fetch_response, started_at);

let response = response?;
let fetch_response = fetch_response?;

let keys = response.contents().unwrap_or_default();
let empty = Vec::new();
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

for object in keys {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
result.keys.push(remote_path);
}

result.prefixes.extend(
prefixes
.iter()
document_keys.extend(
fetch_response
.common_prefixes
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);

continuation_token = match response.next_continuation_token {
continuation_token = match fetch_response.next_continuation_token {
Some(new_token) => Some(new_token),
None => break,
};
}

Ok(result)
Ok(document_keys)
}

/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let kind = RequestKind::List;

let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());

// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);

let response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(folder_name.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list files in S3 bucket");

let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);

let response = response?;

for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
all_files.push(remote_path);
}
match response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(all_files)
}

async fn upload(
@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;

use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
};
use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};

pub struct UnreliableWrapper {
inner: crate::GenericRemoteStorage,

@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_files(folder).await
}

async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
) -> Result<Listing, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
self.inner.list(prefix, mode).await
}

async fn upload(
&self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -73,8 +73,6 @@ pub mod completion;
/// Reporting utilities
pub mod error;

pub mod sync;

/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

@@ -130,21 +128,6 @@ macro_rules! project_git_version {
};
}

/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
#[macro_export]
macro_rules! project_build_tag {
($const_identifier:ident) => {
const $const_identifier: &::core::primitive::str = {
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
::core::option::Option::Some(x) => ["build_tag-env:", x],
::core::option::Option::None => ["build_tag:", ""],
};

$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
};
};
}

/// Re-export for `project_git_version` macro
#[doc(hidden)]
pub use const_format as __const_format;

@@ -1 +0,0 @@
pub mod heavier_once_cell;
@@ -1,383 +0,0 @@
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex, MutexGuard,
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
||||
/// for the duration of initialization.
|
||||
///
|
||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||
///
|
||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||
pub struct OnceCell<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
initializers: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<T> Default for OnceCell<T> {
|
||||
/// Create new uninitialized [`OnceCell`].
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Semaphore is the current state:
|
||||
/// - open semaphore means the value is `None`, not yet initialized
|
||||
/// - closed semaphore means the value has been initialized
|
||||
#[derive(Debug)]
|
||||
struct Inner<T> {
|
||||
init_semaphore: Arc<Semaphore>,
|
||||
value: Option<T>,
|
||||
}
|
||||
|
||||
impl<T> Default for Inner<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
init_semaphore: Arc::new(Semaphore::new(1)),
|
||||
value: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> OnceCell<T> {
|
||||
/// Creates an already initialized `OnceCell` with the given value.
|
||||
pub fn new(value: T) -> Self {
|
||||
let sem = Semaphore::new(1);
|
||||
sem.close();
|
||||
Self {
|
||||
inner: Mutex::new(Inner {
|
||||
init_semaphore: Arc::new(sem),
|
||||
value: Some(value),
|
||||
}),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
||||
/// returning the guard.
|
||||
///
|
||||
/// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
|
||||
///
|
||||
/// Initialization is panic-safe and cancellation-safe.
|
||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||
where
|
||||
F: FnOnce(InitPermit) -> Fut,
|
||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
||||
{
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire_owned().await
|
||||
};
|
||||
|
||||
match permit {
|
||||
Ok(permit) => {
|
||||
let permit = InitPermit(permit);
|
||||
let (value, _permit) = factory(permit).await?;
|
||||
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
Ok(Self::set0(value, guard))
|
||||
}
|
||||
Err(_closed) => {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the inner has already been initialized.
|
||||
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
||||
// give more permits right now.
|
||||
if guard.init_semaphore.try_acquire().is_ok() {
|
||||
drop(guard);
|
||||
panic!("permit is of wrong origin");
|
||||
}
|
||||
|
||||
Self::set0(value, guard)
|
||||
}
|
||||
|
||||
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
|
||||
if guard.value.is_some() {
|
||||
drop(guard);
|
||||
unreachable!("we won permit, must not be initialized");
|
||||
}
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Guard(guard)
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, if any.
|
||||
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
Some(Guard(guard))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
|
||||
/// initializing task for example at the end of initialization.
|
||||
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
|
||||
|
||||
impl<'a, T> CountWaitingInitializers<'a, T> {
|
||||
fn start(target: &'a OnceCell<T>) -> Self {
|
||||
target.initializers.fetch_add(1, Ordering::Relaxed);
|
||||
CountWaitingInitializers(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||
/// initialized value.
|
||||
#[derive(Debug)]
|
||||
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
|
||||
|
||||
impl<T> std::ops::Deref for Guard<'_, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_ref()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::ops::DerefMut for Guard<'_, T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_mut()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
/// Take the current value, and a new permit for it's deinitialization.
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let permit = swapped
|
||||
.init_semaphore
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.expect("we just created this");
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, InitPermit(permit)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}

/// Type held by OnceCell (de)initializing task.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);

#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        convert::Infallible,
        sync::atomic::{AtomicUsize, Ordering},
        time::Duration,
    };

    #[tokio::test]
    async fn many_initializers() {
        #[derive(Default, Debug)]
        struct Counters {
            factory_got_to_run: AtomicUsize,
            future_polled: AtomicUsize,
            winners: AtomicUsize,
        }

        let initializers = 100;

        let cell = Arc::new(OnceCell::default());
        let counters = Arc::new(Counters::default());
        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));

        let mut js = tokio::task::JoinSet::new();
        for i in 0..initializers {
            js.spawn({
                let cell = cell.clone();
                let counters = counters.clone();
                let barrier = barrier.clone();

                async move {
                    barrier.wait().await;
                    let won = {
                        let g = cell
                            .get_or_init(|permit| {
                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                                async {
                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
                                    Ok::<_, Infallible>((i, permit))
                                }
                            })
                            .await
                            .unwrap();

                        *g == i
                    };

                    if won {
                        counters.winners.fetch_add(1, Ordering::Relaxed);
                    }
                }
            });
        }

        barrier.wait().await;

        while let Some(next) = js.join_next().await {
            next.expect("no panics expected");
        }

        let mut counters = Arc::try_unwrap(counters).unwrap();

        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
        assert_eq!(*counters.future_polled.get_mut(), 1);
        assert_eq!(*counters.winners.get_mut(), 1);
    }

    #[tokio::test(start_paused = true)]
    async fn reinit_waits_for_deinit() {
        // with tokio::time paused, we will "sleep" for 1s while holding the reinitialization permit
        let sleep_for = Duration::from_secs(1);
        let initial = 42;
        let reinit = 1;
        let cell = Arc::new(OnceCell::new(initial));

        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));

        let jh = tokio::spawn({
            let cell = cell.clone();
            let deinitialization_started = deinitialization_started.clone();
            async move {
                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
                assert_eq!(answer, initial);

                deinitialization_started.wait().await;
                tokio::time::sleep(sleep_for).await;
            }
        });

        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
            .await
            .unwrap();

        let elapsed = started_at.elapsed();
        assert!(
            elapsed >= sleep_for,
            "initialization should have taken at least the time slept with the permit held"
        );

        jh.await.unwrap();

        assert_eq!(*cell.get().unwrap(), reinit);
    }

    #[test]
    fn reinit_with_deinit_permit() {
        let cell = Arc::new(OnceCell::new(42));

        let (mol, permit) = cell.get().unwrap().take_and_deinit();
        cell.set(5, permit);
        assert_eq!(*cell.get().unwrap(), 5);

        let (five, permit) = cell.get().unwrap().take_and_deinit();
        assert_eq!(5, five);
        cell.set(mol, permit);
        assert_eq!(*cell.get().unwrap(), 42);
    }

    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
            cell.get_or_init(|_permit| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");
    }

    #[tokio::test]
    async fn initialization_is_cancellation_safe() {
        let cell = OnceCell::default();

        let barrier = tokio::sync::Barrier::new(2);

        let initializer = cell.get_or_init(|permit| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

            Ok::<_, Infallible>(("never reached", permit))
        });

        tokio::select! {
            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
            _ = barrier.wait() => {}
        };

        // now initializer is dropped

        assert!(cell.get().is_none());

        let g = cell
            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
    }
}
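
For orientation, the contract the tests above exercise: the factory passed to `get_or_init` receives an `InitPermit` and must return it alongside the computed value, which proves the initialization happened under the cell's semaphore. A minimal caller sketch (the function and value are made up; only the `OnceCell` API comes from this file):

// Sketch only: demonstrates the permit-carrying factory contract.
async fn demo(cell: &OnceCell<u64>) -> anyhow::Result<u64> {
    let guard = cell
        .get_or_init(|permit| async move {
            let value = 42u64; // real, possibly expensive work would go here
            Ok::<_, anyhow::Error>((value, permit))
        })
        .await?;
    Ok(*guard)
}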

@@ -21,6 +21,11 @@ pub struct FileCacheState {

#[derive(Debug)]
pub struct FileCacheConfig {
    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
    /// memory available for the cgroup.
    pub(crate) in_memory: bool,

    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
    spread_factor: f64,
}

impl Default for FileCacheConfig {
    fn default() -> Self {
impl FileCacheConfig {
    pub fn default_in_memory() -> Self {
        Self {
            in_memory: true,
            // 75 %
            resource_multiplier: 0.75,
            // 640 MiB; (512 + 128)
            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
            // ensure any increase in file cache size is split 90-10, with 10% to other memory
            spread_factor: 0.1,
        }
    }

    pub fn default_on_disk() -> Self {
        Self {
            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
            spread_factor: 0.1,
        }
    }
}

impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity

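Replacing the blanket `Default` impl with two named constructors makes the in-memory/on-disk choice explicit at the call site. A minimal wiring sketch (the surrounding function is illustrative; only `FileCacheConfig`'s methods come from the diff above):

// Hypothetical caller choosing between the two defaults.
fn make_config(on_disk: bool) -> anyhow::Result<FileCacheConfig> {
    let config = if on_disk {
        FileCacheConfig::default_on_disk()
    } else {
        FileCacheConfig::default_in_memory()
    };
    config.validate()?; // consistency check shown above
    Ok(config)
}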

@@ -39,6 +39,16 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
    /// kernel's page cache), and therefore should not count against available memory.
    //
    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
    // during the switch away from an in-memory file cache, we had to default to the previous
    // behavior.
    #[arg(long)]
    pub file_cache_on_disk: bool,

    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
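
With clap's default kebab-case mapping, the new field surfaces as a `--file-cache-on-disk` boolean switch, so a hypothetical invocation would look like `<monitor-binary> --file-cache-on-disk --pgconnstr <connstr>` (the binary name here is illustrative, not from the diff).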

@@ -156,7 +156,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
            let config = FileCacheConfig::default();
            let config = match args.file_cache_on_disk {
                true => FileCacheConfig::default_on_disk(),
                false => FileCacheConfig::default_in_memory(),
            };

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -184,7 +187,10 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

            file_cache_disk_size = actual_size;
            if args.file_cache_on_disk {
                file_cache_disk_size = actual_size;
            }

            state.filecache = Some(file_cache);
        }

@@ -233,11 +239,17 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
        let expected_file_cache_size = self
        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
            .unwrap_or(0);
            .map(|file_cache| {
                let size = file_cache.config.calculate_cache_size(usable_system_memory);
                match file_cache.config.in_memory {
                    true => (size, 0),
                    false => (size, size),
                }
            })
            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -261,7 +273,7 @@ impl Runner {

        let new_threshold = self
            .config
            .cgroup_threshold(usable_system_memory, expected_file_cache_size);
            .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

        let current = last_history.avg_non_reclaimable;

@@ -288,10 +300,13 @@ impl Runner {
            .set_file_cache_size(expected_file_cache_size)
            .await
            .context("failed to set file cache size")?;
        file_cache_disk_size = actual_usage;
        if !file_cache.config.in_memory {
            file_cache_disk_size = actual_usage;
        }
        let message = format!(
            "set file cache size to {} MiB",
            "set file cache size to {} MiB (in memory = {})",
            bytes_to_mebibytes(actual_usage),
            file_cache.config.in_memory,
        );
        info!("downscale: {message}");
        status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
            .set_file_cache_size(expected_usage)
            .await
            .context("failed to set file cache size")?;
        file_cache_disk_size = actual_usage;
        if !file_cache.config.in_memory {
            file_cache_disk_size = actual_usage;
        }

        if actual_usage != expected_usage {
            warn!(
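
The net effect of these hunks is that the cache always counts toward the memory-based sizing, but only an on-disk cache contributes to `file_cache_disk_size`, which in turn feeds the cgroup threshold. A toy restatement of the rule (names illustrative, not the real module):

// (bytes counted toward memory sizing, bytes assumed resident on disk)
fn expected_sizes(cache_size_bytes: u64, in_memory: bool) -> (u64, u64) {
    match in_memory {
        true => (cache_size_bytes, 0),
        false => (cache_size_bytes, cache_size_bytes),
    }
}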

@@ -34,12 +34,11 @@ use postgres_backend::AuthType;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
    signals::Signal, tcp_listener,
    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
    tcp_listener,
};

project_git_version!(GIT_VERSION);
project_build_tag!(BUILD_TAG);

const PID_FILE_NAME: &str = "pageserver.pid";

@@ -259,12 +258,11 @@ fn start_pageserver(
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
    info!(
        "version: {} launch_timestamp: {} build_tag: {}",
        "version: {} launch_timestamp: {}",
        version(),
        launch_ts.to_string(),
        BUILD_TAG,
        launch_ts.to_string()
    );
    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_build_info_metric(GIT_VERSION);
    set_launch_timestamp_metric(launch_ts);
    pageserver::preinitialize_metrics();

@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
    TIMELINES_SEGMENT_NAME,
};
use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -632,6 +633,11 @@ impl PageServerConf {
        self.tenants_path().join(tenant_id.to_string())
    }

    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id)
            .join(TENANT_ATTACHING_MARKER_FILENAME)
    }

    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }
@@ -1473,6 +1479,8 @@ threshold = "20m"
        Some(DiskUsageEvictionTaskConfig {
            max_usage_pct: Percent::new(80).unwrap(),
            min_avail_bytes: 0,
            target_avail_bytes: None,
            target_usage_pct: None,
            period: Duration::from_secs(10),
            #[cfg(feature = "testing")]
            mock_statvfs: None,

@@ -57,10 +57,7 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
            headers.insert(
                "Authorization",
                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
            );
            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
            client = client.default_headers(headers);
        }

@@ -147,7 +144,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
            .map(|t| (t.id, Generation::new(t.gen)))
            .map(|t| (t.id, Generation::new(t.generation)))
            .collect::<HashMap<_, _>>())
    }

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
@@ -272,9 +271,7 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
            .maybe_fatal_err("save deletion header")?;

        Ok(())
            .map_err(Into::into)
    }
}

@@ -363,7 +360,6 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
}

@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;

// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
            debug!("Deletion header {header_path} not found, first start?");
            Ok(None)
        } else {
            on_fatal_io_error(&e, "reading deletion header");
            Err(anyhow::anyhow!(e))
        }
    }
}
@@ -218,9 +216,16 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
        let mut dir = tokio::fs::read_dir(&deletion_directory)
            .await
            .fatal_err("read deletion directory");
        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
            Ok(d) => d,
            Err(e) => {
                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");

                // Give up: if we can't read the deletion list directory, we probably can't
                // write lists into it later, so the queue won't work.
                return Err(e.into());
            }
        };

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -241,9 +246,11 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
                tokio::fs::remove_file(&absolute_path)
                    .await
                    .fatal_err("delete temp file");
                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
                    // Non-fatal error: we will just leave the file behind but not
                    // try and load it.
                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
                }

                continue;
            }
@@ -283,9 +290,7 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

            let list_bytes = tokio::fs::read(&list_path)
                .await
                .fatal_err("read deletion list");
            let list_bytes = tokio::fs::read(&list_path).await?;

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
@@ -344,7 +349,7 @@ impl ListWriter {
        info!("Started deletion frontend worker");

        // Synchronous, but we only do it once per process lifetime so it's tolerable
        if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),

@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;

use super::deleter::DeleterMessage;
use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
            tokio::fs::remove_file(&list_path)
                .await
                .fatal_err("remove deletion list");

            if let Err(e) = tokio::fs::remove_file(&list_path).await {
                // Unexpected: we should have permissions and nothing else should
                // be touching these files. We will leave the file behind. Subsequent
                // pageservers will try and load it again: hopefully whatever storage
                // issue (probably permissions) has been fixed by then.
                tracing::error!("Failed to delete {list_path}: {e:#}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
                break;
            }
        }
    }
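
Across these deletion-queue hunks, the two sides of the comparison differ mainly in error-handling policy: one treats local I/O failures as fatal via `fatal_err`/`on_fatal_io_error`, the other logs and degrades or propagates. The two idioms side by side, as a sketch (paths and messages made up):

use anyhow::Context;

async fn remove_gracefully(path: &std::path::Path) {
    // tolerate the failure: log it and keep going
    if let Err(e) = tokio::fs::remove_file(path).await {
        tracing::warn!("failed to remove {}: {e:#}", path.display());
    }
}

async fn remove_or_propagate(path: &std::path::Path) -> anyhow::Result<()> {
    // hand the failure to the caller instead
    tokio::fs::remove_file(path)
        .await
        .with_context(|| format!("remove {}", path.display()))
}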

@@ -60,27 +60,47 @@ use utils::serde_percent::Percent;
use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        self,
        storage_layer::{AsLayerDesc, EvictionError, Layer},
        Timeline,
    },
    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
};

#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: Percent,
    pub min_avail_bytes: u64,

    // Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
    // we will keep evicting layers until we reach the target. The resulting disk usage should look
    // like a sawtooth bouncing between the upper max/min line and the lower target line.
    #[serde(default)]
    pub target_usage_pct: Option<Percent>,
    #[serde(default)]
    pub target_avail_bytes: Option<u64>,

    #[serde(with = "humantime_serde")]
    pub period: Duration,
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
}

#[derive(Default)]
enum Status {
    /// We are within disk limits, and not currently doing any eviction
    #[default]
    Idle,
    /// Disk limits have been exceeded: we will evict soon
    UnderPressure,
    /// We are currently doing an eviction pass.
    Evicting,
}

#[derive(Default)]
pub struct State {
    /// Exclude http requests and background task from running at the same time.
    mutex: tokio::sync::Mutex<()>,

    /// Publish the current status of eviction work, for visibility to other subsystems
    /// that modify their behavior if disk pressure is high or if eviction is going on.
    status: std::sync::RwLock<Status>,
}

pub fn launch_disk_usage_global_eviction_task(
@@ -112,7 +132,7 @@ pub fn launch_disk_usage_global_eviction_task(
            _ = background_jobs_barrier.wait() => { }
        };

        disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
        disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
            .await;
        Ok(())
    },
@@ -125,7 +145,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    _storage: &GenericRemoteStorage,
    storage: GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
) {
@@ -149,8 +169,14 @@ async fn disk_usage_eviction_task(
    let start = Instant::now();

    async {
        let res =
            disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
        let res = disk_usage_eviction_task_iteration(
            state,
            task_config,
            &storage,
            tenants_dir,
            &cancel,
        )
        .await;

        match res {
            Ok(()) => {}
@@ -174,25 +200,34 @@ async fn disk_usage_eviction_task(
}

pub trait Usage: Clone + Copy + std::fmt::Debug {
    fn has_pressure(&self) -> bool;
    fn pressure(&self) -> f64;
    fn over_pressure(&self) -> bool;
    fn no_pressure(&self) -> bool;
    fn add_available_bytes(&mut self, bytes: u64);
}

async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;

    if usage_pre.over_pressure() {
        *state.status.write().unwrap() = Status::Evicting;
    }

    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
            match outcome {
            let new_status = match outcome {
                IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
                    // nothing to do, select statement below will handle things
                    Status::Idle
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
@@ -202,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(

                    debug!(?after, "disk usage");

                    if after.has_pressure() {
                    if after.over_pressure() {
                        // Don't bother doing an out-of-order iteration here now.
                        // In practice, the task period is set to a value in the tens-of-seconds range,
                        // which will cause another iteration to happen soon enough.
                        // TODO: deltas between the three different usages would be helpful,
                        // consider MiB, GiB, TiB
                        warn!(?outcome, ?after, "disk usage still high");
                        Status::UnderPressure
                    } else {
                        info!(?outcome, ?after, "disk usage pressure relieved");
                        Status::Idle
                    }
                }
            };

            *state.status.write().unwrap() = new_status;
        }
        Err(e) => {
            error!("disk_usage_eviction_iteration failed: {:#}", e);
            *state.status.write().unwrap() = if usage_pre.over_pressure() {
                Status::UnderPressure
            } else {
                Status::Idle
            };
        }
    }

@@ -270,6 +314,7 @@ struct LayerCount {

pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -281,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    debug!(?usage_pre, "disk usage");

    if !usage_pre.has_pressure() {
    if !usage_pre.over_pressure() {
        return Ok(IterationOutcome::NoPressure);
    } else {
        *state.status.write().unwrap() = Status::Evicting;
    }

    warn!(
@@ -326,12 +373,11 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
        if usage_planned.no_pressure() {
            debug!(
                no_candidates_evicted = i,
                "took enough candidates for pressure to be relieved"
@@ -346,18 +392,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
        // tasks to evict all seen layers until we have evicted enough

        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();

        // the semaphore will later be used to limit eviction concurrency, and we can express at
        // most a u32 number of permits. It's unlikely we would have u32::MAX layers to evict,
        // but fail gracefully by not making batches larger.
        if batch.len() < u32::MAX as usize {
            batch.push(candidate.layer);
            max_batch_size = max_batch_size.max(batch.len());
        }
        batched
            .entry(TimelineKey(candidate.timeline))
            .or_default()
            .push(candidate.layer);
    }

    let usage_planned = match warned {
@@ -374,101 +412,69 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

    let mut js = tokio::task::JoinSet::new();

    // ratelimit to 1k files or any higher max batch size
    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));

    // After the loop, `usage_assumed` is the post-eviction usage,
    // according to internal accounting.
    let mut usage_assumed = usage_pre;
    let mut evictions_failed = LayerCount::default();
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
        let batch_size =
            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");

        // I dislike the naming of `available_permits`, but it means the current total amount of
        // permits, because permits can be added
        assert!(batch_size as usize <= limit.available_permits());
        let batch_size = batch.len();

        debug!(%timeline_id, "evicting batch for timeline");

        let evict = {
            let limit = limit.clone();
            let cancel = cancel.clone();
            async move {
                let mut evicted_bytes = 0;
                let mut evictions_failed = LayerCount::default();
        async {
            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;

                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
                    // semaphore closing means cancelled
                    return (evicted_bytes, evictions_failed);
                };

                let results = timeline.evict_layers(&batch, &cancel).await;

                match results {
                    Ok(results) => {
                        assert_eq!(results.len(), batch.len());
                        for (result, layer) in results.into_iter().zip(batch.iter()) {
                            let file_size = layer.layer_desc().file_size;
                            match result {
                                Some(Ok(())) => {
                                    evicted_bytes += file_size;
                                }
                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                                    evictions_failed.file_sizes += file_size;
                                    evictions_failed.count += 1;
                                }
                                None => {
                                    assert!(cancel.is_cancelled());
                                }
            match results {
                Err(e) => {
                    warn!("failed to evict batch: {:#}", e);
                }
                Ok(results) => {
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
                        let file_size = layer.layer_desc().file_size;
                        match result {
                            Some(Ok(())) => {
                                usage_assumed.add_available_bytes(file_size);
                            }
                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
                            }
                            Some(Err(EvictionError::FileNotFound)) => {
                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            Some(Err(
                                e @ EvictionError::LayerNotFound(_)
                                | e @ EvictionError::StatFailed(_),
                            )) => {
                                let e = utils::error::report_compact_sources(&e);
                                warn!(%layer, "failed to evict layer: {e}");
                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
                                warn!(%layer, "failed to evict layer: {detail}");
                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            None => {
                                assert!(cancel.is_cancelled());
                                return;
                            }
                        }
                    }
                    Err(e) => {
                        warn!("failed to evict batch: {:#}", e);
                    }
                }
                (evicted_bytes, evictions_failed)
            }
        }
        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
        .await;

        js.spawn(evict);

        // spawning multiple thousands of these is essentially blocking, so give the already
        // spawned ones a chance of making progress
        tokio::task::yield_now().await;
    }

    let join_all = async move {
        // After the evictions, `usage_assumed` is the post-eviction usage,
        // according to internal accounting.
        let mut usage_assumed = usage_pre;
        let mut evictions_failed = LayerCount::default();

        while let Some(res) = js.join_next().await {
            match res {
                Ok((evicted_bytes, failed)) => {
                    usage_assumed.add_available_bytes(evicted_bytes);
                    evictions_failed.file_sizes += failed.file_sizes;
                    evictions_failed.count += failed.count;
                }
                Err(je) if je.is_cancelled() => unreachable!("not used"),
                Err(je) if je.is_panic() => { /* already logged */ }
                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
            }
        }
        (usage_assumed, evictions_failed)
    };

    let (usage_assumed, evictions_failed) = tokio::select! {
        tuple = join_all => { tuple },
        _ = cancel.cancelled() => {
            // close the semaphore to stop any pending acquires
            limit.close();
    if cancel.is_cancelled() {
        return Ok(IterationOutcome::Cancelled);
    }
        };
    }

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -483,7 +489,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
#[derive(Clone)]
struct EvictionCandidate {
    timeline: Arc<Timeline>,
    layer: Layer,
    layer: Arc<dyn PersistentLayer>,
    last_activity_ts: SystemTime,
}

@@ -681,22 +687,57 @@ mod filesystem_level_usage {
    }

    impl super::Usage for Usage<'_> {
        fn has_pressure(&self) -> bool {
            let usage_pct =
                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
        /// Has the pressure reached 1.0, i.e. has disk usage exceeded the upper bound?
        ///
        /// This is the condition for starting eviction.
        fn over_pressure(&self) -> bool {
            self.pressure() >= 1.0
        }

            let pressures = [
                (
                    "min_avail_bytes",
                    self.avail_bytes < self.config.min_avail_bytes,
                ),
                (
                    "max_usage_pct",
                    usage_pct >= self.config.max_usage_pct.get() as u64,
                ),
            ];
        /// Is the pressure <= 0, i.e. has disk usage gone below the target bound?
        ///
        /// This is the condition for dropping out of eviction.
        fn no_pressure(&self) -> bool {
            self.pressure() <= 0.0
        }

            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
        fn pressure(&self) -> f64 {
            let max_usage = std::cmp::min(
                self.total_bytes - self.config.min_avail_bytes,
                (self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
            );

            let mut target_usage = max_usage;
            if let Some(target_avail_bytes) = self.config.target_avail_bytes {
                target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
            }
            if let Some(target_usage_pct) = self.config.target_usage_pct {
                target_usage = std::cmp::min(
                    target_usage,
                    (self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
                );
            };

            let usage = self.total_bytes - self.avail_bytes;
            eprintln!(
                "pressure: {} {}, current {}",
                target_usage, max_usage, usage
            );
            if target_usage == max_usage {
                // We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
                if usage >= max_usage {
                    1.0
                } else {
                    0.0
                }
            } else if usage <= target_usage {
                // No pressure.
                0.0
            } else {
                // We are above target: pressure is the ratio of how much we exceed target to the size of the gap
                let range_size = (max_usage - target_usage) as f64;
                (usage - target_usage) as f64 / range_size
            }
        }

        fn add_available_bytes(&mut self, bytes: u64) {
@@ -750,6 +791,8 @@ mod filesystem_level_usage {
            config: &DiskUsageEvictionTaskConfig {
                max_usage_pct: Percent::new(85).unwrap(),
                min_avail_bytes: 0,
                target_avail_bytes: None,
                target_usage_pct: None,
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
@@ -758,24 +801,24 @@ mod filesystem_level_usage {
            avail_bytes: 0,
        };

        assert!(usage.has_pressure(), "expected pressure at 100%");
        assert!(usage.over_pressure(), "expected pressure at 100%");

        usage.add_available_bytes(14_000);
        assert!(usage.has_pressure(), "expected pressure at 86%");
        assert!(usage.over_pressure(), "expected pressure at 86%");

        usage.add_available_bytes(999);
        assert!(usage.has_pressure(), "expected pressure at 85.001%");
        assert!(usage.over_pressure(), "expected pressure at 85.001%");

        usage.add_available_bytes(1);
        assert!(usage.has_pressure(), "expected pressure at precisely 85%");
        assert!(usage.over_pressure(), "expected pressure at precisely 85%");

        usage.add_available_bytes(1);
        assert!(!usage.has_pressure(), "no pressure at 84.999%");
        assert!(!usage.over_pressure(), "no pressure at 84.999%");

        usage.add_available_bytes(999);
        assert!(!usage.has_pressure(), "no pressure at 84%");
        assert!(!usage.over_pressure(), "no pressure at 84%");

        usage.add_available_bytes(16_000);
        assert!(!usage.has_pressure());
        assert!(!usage.over_pressure());
    }
}
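
For intuition about the sawtooth behavior, applying the `pressure()` formula above to made-up numbers (an illustrative configuration, not from the diff):

// Worked example: total = 100 GiB, max_usage_pct = 85, target_usage_pct = 75,
// current usage = 80 GiB. Pressure is the fraction of the target..max gap used.
fn main() {
    let total: u64 = 100 * 1024 * 1024 * 1024;
    let max_usage = (total as f64 * 0.85) as u64; // eviction starts at/above this
    let target_usage = (total as f64 * 0.75) as u64; // eviction stops at/below this
    let usage = (total as f64 * 0.80) as u64;
    let pressure = (usage - target_usage) as f64 / (max_usage - target_usage) as f64;
    assert!((pressure - 0.5).abs() < 1e-6); // halfway between target and max
}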

@@ -392,19 +392,13 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
        - name: version
          in: query
          required: false
          schema:
            type: integer
          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"
                type: string
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -569,17 +563,7 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
          description: |
            The tenant is already known to Pageserver in some way,
            and hence this `/attach` call has been rejected.

            Some examples of how this can happen:
            - tenant was created on this pageserver
            - tenant attachment was started by an earlier call to `/attach`.

            Callers should poll the tenant status's `attachment_status` field,
            like for status 202. See the longer description for `POST /attach`
            for details.
          description: Tenant download is already in progress
          content:
            application/json:
              schema:
@@ -723,12 +707,6 @@ paths:

        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
      requestBody:
        required: false
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/TenantLoadRequest"
      responses:
        "202":
          description: Tenant scheduled to load successfully
@@ -1219,15 +1197,6 @@ components:
        new_tenant_id:
          type: string
          format: hex
        generation:
          type: integer
          description: Attachment generation number.
    TenantLoadRequest:
      type: object
      properties:
        generation:
          type: integer
          description: Attachment generation number.
    TenantAttachRequest:
      type: object
      required:
@@ -1415,19 +1384,6 @@ components:
          type: string
          format: hex

    LsnByTimestampResponse:
      type: object
      required:
        - lsn
        - kind
      properties:
        lsn:
          type: string
          format: hex
        kind:
          type: string
          enum: [past, present, future, nodata]

    Error:
      type: object
      required:

@@ -8,7 +8,7 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header;
use hyper::header::CONTENT_TYPE;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
@@ -17,7 +17,6 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
};
use remote_storage::GenericRemoteStorage;
use serde_with::{serde_as, DisplayFromStr};
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -485,8 +484,6 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let version: Option<u8> = parse_query_param(&request, "version")?;

    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -498,32 +495,13 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

    if version.unwrap_or(0) > 1 {
        #[serde_as]
        #[derive(serde::Serialize)]
        struct Result {
            #[serde_as(as = "DisplayFromStr")]
            lsn: Lsn,
            kind: &'static str,
        }
        let (lsn, kind) = match result {
            LsnForTimestamp::Present(lsn) => (lsn, "present"),
            LsnForTimestamp::Future(lsn) => (lsn, "future"),
            LsnForTimestamp::Past(lsn) => (lsn, "past"),
            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
        };
        json_response(StatusCode::OK, Result { lsn, kind })
    } else {
        // FIXME: this is a temporary crutch not to break backwards compatibility
        // See https://github.com/neondatabase/neon/pull/5608
        let result = match result {
            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
            LsnForTimestamp::Future(_lsn) => "future".into(),
            LsnForTimestamp::Past(_lsn) => "past".into(),
            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
        };
        json_response(StatusCode::OK, result)
    }
    let result = match result {
        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
        LsnForTimestamp::Future(_lsn) => "future".into(),
        LsnForTimestamp::Past(_lsn) => "past".into(),
        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
    };
    json_response(StatusCode::OK, result)
}

async fn get_timestamp_of_lsn_handler(
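
A hedged illustration of the two response shapes implied by the versioned handler above (values made up): with `?version=2` the endpoint returns a JSON object such as `{"lsn": "0/15D3DD8", "kind": "present"}`, while the legacy (default) form returns a bare JSON string such as `"0/15D3DD8"` or `"future"`.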
@@ -789,10 +767,6 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
    let accepts_html = headers
        .get(header::ACCEPT)
        .map(|v| v == "text/html")
        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -800,11 +774,11 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
        if accepts_html {
        if headers["Accept"] == "text/html" {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
    } else if accepts_html {
    } else if headers["Accept"] == "text/html" {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
@@ -955,7 +929,7 @@ fn synthetic_size_html_response(
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
        .header(header::CONTENT_TYPE, "text/html")
        .header(hyper::header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1205,7 +1179,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
            .map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1230,7 +1204,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
@@ -1336,7 +1310,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
                .header(header::CONTENT_TYPE, "application/octet-stream")
                .header(CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1478,10 +1452,22 @@ async fn disk_usage_eviction_run(
    }

    impl crate::disk_usage_eviction_task::Usage for Usage {
        fn has_pressure(&self) -> bool {
        fn over_pressure(&self) -> bool {
            self.config.evict_bytes > self.freed_bytes
        }

        fn no_pressure(&self) -> bool {
            !self.over_pressure()
        }

        fn pressure(&self) -> f64 {
            if self.over_pressure() {
                1.0
            } else {
                0.0
            }
        }

        fn add_available_bytes(&mut self, bytes: u64) {
            self.freed_bytes += bytes;
        }
@@ -1500,11 +1486,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

    if state.remote_storage.as_ref().is_none() {
    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
    }
    };

    let state = state.disk_usage_eviction_state.clone();

@@ -1522,6 +1508,7 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
                &storage,
                usage,
                &child_cancel,
            )

@@ -149,10 +149,6 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
}

// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
// from the name.

pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}

@@ -1388,23 +1388,28 @@ impl TimelineMetrics {
    }
}

    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
    pub fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
    pub fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
    pub fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

    pub(crate) fn resident_physical_size_get(&self) -> u64 {
    pub fn resident_physical_size_set(&self, sz: u64) {
        self.resident_physical_size_gauge.set(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
    }

    pub fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
}
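
Each helper above applies the same delta to the per-timeline gauge and to a process-global gauge so that both granularities stay consistent. A minimal sketch of that pattern, assuming `prometheus::IntGauge` (the real code uses the crate's own gauge aliases and a global static):

// Illustrative only: mirror every per-timeline update into the global gauge.
fn resident_size_add(per_timeline: &prometheus::IntGauge, global: &prometheus::IntGauge, sz: i64) {
    per_timeline.add(sz); // this timeline's view
    global.add(sz); // process-wide aggregate stays in sync
}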

@@ -552,8 +552,7 @@ impl Timeline {
            Err(e) => Err(PageReconstructError::from(e)),
        },
        Err(e) => {
            // This is expected: historical databases do not have the key.
            debug!("Failed to get info about AUX files: {}", e);
            warn!("Failed to get info about AUX files: {}", e);
            Ok(HashMap::new())
        }
    }
@@ -676,9 +675,8 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
        result.add_key(AUX_FILES_KEY);

        Ok(result.to_keyspace())
    }

@@ -1203,8 +1201,7 @@ impl<'a> DatadirModification<'a> {
        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
            Ok(buf) => AuxFilesDirectory::des(&buf)?,
            Err(e) => {
                // This is expected: historical databases do not have the key.
                debug!("Failed to get info about AUX files: {}", e);
                warn!("Failed to get info about AUX files: {}", e);
                AuxFilesDirectory {
                    files: HashMap::new(),
                }
File diff suppressed because it is too large
@@ -3,10 +3,10 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, warn, Instrument, Span};
use tracing::{error, info, instrument, warn, Instrument, Span};

use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -25,9 +25,11 @@ use super::{
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
    tree_sort_timelines, DeleteTimelineError, Tenant,
};

const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

#[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
@@ -58,7 +60,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
}

async fn create_remote_delete_mark(
@@ -148,8 +150,7 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
        let list = &list.into_iter().take(10).collect::<Vec<_>>();
        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
@@ -238,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
}

pub(crate) async fn remote_delete_mark_exists(
    conf: &PageServerConf,
    tenant_id: &TenantId,
    remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<bool> {
    // If remote storage is there we rely on it
    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;

    let result = backoff::retry(
        || async { remote_storage.download(&remote_mark_path).await },
        |e| matches!(e, DownloadError::NotFound),
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
    )
    .await;

    match result {
        Ok(_) => Ok(true),
        Err(DownloadError::NotFound) => Ok(false),
        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
    }
}

/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
@@ -249,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
@@ -350,7 +378,7 @@ impl DeleteTenantFlow {
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
@@ -361,25 +389,66 @@ impl DeleteTenantFlow {
|
||||
)
|
||||
};
|
||||
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_load(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true, false)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, None, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -390,7 +459,7 @@ impl DeleteTenantFlow {
            .expect("can't be stopping or broken");

        tenant
            .attach(init_order, preload, ctx)
            .attach(ctx, super::AttachMarkerMode::Expect)
            .await
            .context("attach")?;

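The three entrypoints above are selected by which deletion marks survive a crash. A standalone sketch of that dispatch, with plain booleans standing in for the pageserver's mark files and tenant state (all names here are illustrative, not the real API):

    // Simplified model: which DeleteTenantFlow entrypoint handles a restart.
    #[derive(Debug, PartialEq)]
    enum Resume {
        No,         // no marks: nothing to resume
        FromLoad,   // restart found a local or remote deletion mark
        FromAttach, // tenant was mid-attach and found already deleted
    }

    fn choose_entrypoint(local_mark: bool, remote_mark: bool, attaching: bool) -> Resume {
        match (local_mark || remote_mark, attaching) {
            (false, _) => Resume::No,
            (true, true) => Resume::FromAttach,
            (true, false) => Resume::FromLoad,
        }
    }

    fn main() {
        assert_eq!(choose_entrypoint(true, false, false), Resume::FromLoad);
        assert_eq!(choose_entrypoint(false, true, true), Resume::FromAttach);
        assert_eq!(choose_entrypoint(false, false, false), Resume::No);
    }
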
@@ -639,10 +639,147 @@ impl LayerMap {
        }

        println!("historic_layers:");
        for desc in self.iter_historic_layers() {
            desc.dump();
        for layer in self.iter_historic_layers() {
            layer.dump(verbose, ctx)?;
        }
        println!("End dump LayerMap");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::LayerMap;
    use crate::tenant::storage_layer::LayerFileName;
    use std::str::FromStr;
    use std::sync::Arc;

    mod l0_delta_layers_updated {

        use crate::tenant::{
            storage_layer::{AsLayerDesc, PersistentLayerDesc},
            timeline::layer_manager::LayerFileManager,
        };

        use super::*;

        struct LayerObject(PersistentLayerDesc);

        impl AsLayerDesc for LayerObject {
            fn layer_desc(&self) -> &PersistentLayerDesc {
                &self.0
            }
        }

        impl LayerObject {
            fn new(desc: PersistentLayerDesc) -> Self {
                LayerObject(desc)
            }
        }

        type TestLayerFileManager = LayerFileManager<LayerObject>;

        #[test]
        fn for_full_range_delta() {
            // l0_delta_layers are used by compaction, and should observe all buffered updates
            l0_delta_layers_updated_scenario(
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
                true
            )
        }

        #[test]
        fn for_non_full_range_delta() {
            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
            l0_delta_layers_updated_scenario(
                "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
                // because not full range
                false
            )
        }

        #[test]
        fn for_image() {
            l0_delta_layers_updated_scenario(
                "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
                // code only checks if it is a full range layer, doesn't care about images, which must
                // mean we should in practice never have full range images
                false
            )
        }

        #[test]
        fn replacing_missing_l0_is_notfound() {
            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
            // however only happen for precondition failures.

            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
            let layer = LayerFileName::from_str(layer).unwrap();
            let layer = PersistentLayerDesc::from(layer);

            // same skeleton construction; see scenario below
            let not_found = Arc::new(LayerObject::new(layer.clone()));
            let new_version = Arc::new(LayerObject::new(layer));

            // after the immutable storage state refactor, the replace operation
            // will not use layer map any more. We keep it here for consistency in test cases
            // and can remove it in the future.
            let _map = LayerMap::default();

            let mut mapping = TestLayerFileManager::new();

            mapping
                .replace_and_verify(not_found, new_version)
                .unwrap_err();
        }

        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = PersistentLayerDesc::from(name);

            let remote = Arc::new(LayerObject::new(skeleton.clone()));
            let downloaded = Arc::new(LayerObject::new(skeleton));

            let mut map = LayerMap::default();
            let mut mapping = LayerFileManager::new();

            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
            assert_eq!(remote.layer_desc(), downloaded.layer_desc());

            let expected_in_counts = (1, usize::from(expected_l0));

            map.batch_update()
                .insert_historic(remote.layer_desc().clone());
            mapping.insert(remote.clone());
            assert_eq!(
                count_layer_in(&map, remote.layer_desc()),
                expected_in_counts
            );

            mapping
                .replace_and_verify(remote, downloaded.clone())
                .expect("name derived attributes are the same");
            assert_eq!(
                count_layer_in(&map, downloaded.layer_desc()),
                expected_in_counts
            );

            map.batch_update().remove_historic(downloaded.layer_desc());
            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }

        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
            let historic = map
                .iter_historic_layers()
                .filter(|x| x.key() == layer.key())
                .count();
            let l0s = map
                .get_level0_deltas()
                .expect("why does this return a result");
            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();

            (historic, l0)
        }
    }
}

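The expected_l0 flags in these scenarios come down to how a layer is classified as L0: a delta layer spanning the full key range. A standalone model of that predicate (simplified types; the real check lives inside LayerMap):

    // Simplified stand-ins for Key ranges and PersistentLayerDesc.
    struct Desc {
        key_start: u128,
        key_end: u128, // exclusive
        is_delta: bool,
    }

    const KEY_MIN: u128 = 0;
    const KEY_MAX: u128 = u128::MAX;

    // Only a delta covering the whole key space counts as L0, which is why the
    // image and the almost-full-range delta scenarios above expect `false`.
    fn is_l0(d: &Desc) -> bool {
        d.is_delta && d.key_start == KEY_MIN && d.key_end == KEY_MAX
    }

    fn main() {
        let full = Desc { key_start: KEY_MIN, key_end: KEY_MAX, is_delta: true };
        let partial = Desc { key_start: KEY_MIN + 1, key_end: KEY_MAX - 1, is_delta: true };
        let image = Desc { key_start: KEY_MIN, key_end: KEY_MAX, is_delta: false };
        assert!(is_l0(&full));
        assert!(!is_l0(&partial));
        assert!(!is_l0(&image));
    }
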
@@ -26,7 +26,10 @@ use crate::deletion_queue::DeletionQueueClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::tenant::{
    create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
    TenantState,
};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

use utils::crashsafe::path_with_suffix_extension;
@@ -434,15 +437,14 @@ pub async fn init_tenant_mgr(
            location_conf.attach_in_generation(generation);
            Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

            match tenant_spawn(
            match schedule_local_tenant_processing(
                conf,
                tenant_id,
                &tenant_dir_path,
                resources.clone(),
                AttachedTenantConf::try_from(location_conf)?,
                resources.clone(),
                Some(init_order.clone()),
                &TENANTS,
                SpawnMode::Normal,
                &ctx,
            ) {
                Ok(tenant) => {
@@ -462,18 +464,15 @@ pub async fn init_tenant_mgr(
    Ok(())
}

/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
/// a broken tenant in the map if Tenant::spawn fails.
#[allow(clippy::too_many_arguments)]
pub(crate) fn tenant_spawn(
pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
    ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -497,24 +496,45 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

    info!("Attaching tenant {tenant_id}");
    let tenant = match Tenant::spawn(
        conf,
        tenant_id,
        resources,
        location_conf,
        init_order,
        tenants,
        mode,
        ctx,
    ) {
        Ok(tenant) => tenant,
        Err(e) => {
            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
        if resources.remote_storage.is_none() {
            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
            Tenant::create_broken_tenant(
                conf,
                tenant_id,
                "attaching mark file present but no remote storage configured".to_string(),
            )
        } else {
            match Tenant::spawn_attach(
                conf,
                tenant_id,
                resources,
                location_conf,
                tenants,
                AttachMarkerMode::Expect,
                ctx,
            ) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
                }
            }
        }
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
        Tenant::spawn_load(
            conf,
            tenant_id,
            location_conf,
            resources,
            init_order,
            tenants,
            ctx,
        )
    };

    Ok(tenant)
}

@@ -650,41 +670,29 @@ pub(crate) async fn create_tenant(
    ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {

        let location_conf = LocationConf::attached_single(tenant_conf, generation);

        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
        super::create_tenant_files(conf, &location_conf, &tenant_id).await?;

        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        // See https://github.com/neondatabase/neon/issues/4233

        let tenant_path = conf.tenant_path(&tenant_id);

        let created_tenant = tenant_spawn(
            conf,
            tenant_id,
            &tenant_path,
            resources,
            AttachedTenantConf::try_from(location_conf)?,
            None,
            &TENANTS,
            SpawnMode::Create,
            ctx,
        )?;
        let created_tenant =
            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        // See https://github.com/neondatabase/neon/issues/4233

        let created_tenant_id = created_tenant.tenant_id();
        anyhow::ensure!(
            tenant_id == created_tenant_id,
            "loaded created tenant has unexpected tenant id \
            (expect {tenant_id} != actual {created_tenant_id})",
        );
            tenant_id == created_tenant_id,
            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {created_tenant_id})",
        );
        Ok(created_tenant)
    })
    .await
    }).await
}

#[derive(Debug, thiserror::Error)]
@@ -793,10 +801,9 @@ pub(crate) async fn upsert_location(
        }
    }

    let tenant_path = conf.tenant_path(&tenant_id);

    let new_slot = match &new_location_config.mode {
        LocationMode::Secondary(_) => {
            let tenant_path = conf.tenant_path(&tenant_id);
            // Directory doesn't need to be fsync'd because if we crash it can
            // safely be recreated next time this tenant location is configured.
            unsafe_create_dir_all(&tenant_path)
@@ -826,21 +833,28 @@ pub(crate) async fn upsert_location(
                .await
                .map_err(SetNewTenantConfigError::Persist)?;

            let tenant = tenant_spawn(
            let tenant = match Tenant::spawn_attach(
                conf,
                tenant_id,
                &tenant_path,
                TenantSharedResources {
                    broker_client,
                    remote_storage,
                    deletion_queue_client,
                },
                AttachedTenantConf::try_from(new_location_config)?,
                None,
                &TENANTS,
                SpawnMode::Normal,
                // The LocationConf API does not use marker files, because we have Secondary
                // locations where the directory's existence is not a signal that it contains
                // all timelines. See https://github.com/neondatabase/neon/issues/5550
                AttachMarkerMode::Ignore,
                ctx,
            )?;
            ) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
                }
            };

            TenantSlot::Attached(tenant)
        }
@@ -1029,7 +1043,7 @@ pub(crate) async fn load_tenant(
    location_conf.attach_in_generation(generation);
    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

    let new_tenant = tenant_spawn(conf, tenant_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)
    let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)
        .with_context(|| {
            format!("Failed to schedule tenant processing in path {tenant_path:?}")
        })?;
@@ -1103,12 +1117,18 @@ pub(crate) async fn attach_tenant(
) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        let location_conf = LocationConf::attached_single(tenant_conf, generation);
        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        // See https://github.com/neondatabase/neon/issues/4233

        let attached_tenant = tenant_spawn(conf, tenant_id, &tenant_dir,
            resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)?;
        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
        let marker_file_exists = conf
            .tenant_attaching_mark_file_path(&tenant_id)
            .try_exists()
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        // See https://github.com/neondatabase/neon/issues/4233

@@ -57,7 +57,8 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
}

/// Parallel fsync asynchronously.
/// Parallel fsync asynchronously. If the number of files is less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
/// execution thread. Otherwise, we spawn_blocking and run it in tokio.
pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
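The doc comment above describes the pattern: offload blocking fsync calls to tokio's blocking pool, with a cap on concurrency. A minimal self-contained sketch of that idea, assuming the tokio crate (the chunk size of 64 mirrors MAX_CONCURRENT_FSYNC; everything else here is illustrative, not the pageserver's implementation):

    use std::path::PathBuf;

    // Fsync many files with bounded concurrency on the blocking pool.
    async fn fsync_all(paths: Vec<PathBuf>) -> std::io::Result<()> {
        const MAX_CONCURRENT_FSYNC: usize = 64;
        for chunk in paths.chunks(MAX_CONCURRENT_FSYNC) {
            let handles: Vec<_> = chunk
                .iter()
                .cloned()
                .map(|p| {
                    // fsync is blocking I/O, so run it off the async executor.
                    tokio::task::spawn_blocking(move || std::fs::File::open(&p)?.sync_all())
                })
                .collect();
            for h in handles {
                h.await.expect("fsync task panicked")?;
            }
        }
        Ok(())
    }
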
@@ -167,15 +167,39 @@
//! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
//!   for layers that are referenced by `IndexPart` but not present locally
//! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//!   the local filesystem, write the remote metadata to the local filesystem
//! - After the above is done for each timeline, open the tenant for business by
//!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//!
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
//! been successfully attached (for performance & cost reasons).
//! Instead, for a tenant without the attach marker file, we assume that the
//! local state is in sync with or ahead of the remote state. This includes the list
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
//! if there's a timeline on the remote that the pageserver doesn't know about,
//! the GC will not consider its branch point, leading to data loss.
//! So, for a tenant with the attach marker file, we know that we have not yet
//! persisted all of the remote timelines' metadata files locally. To exclude the
//! risk above, we re-run the procedure for such tenants.
//!
//! # Operating Without Remote Storage
//!
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
//! not created and the uploads are skipped.
//! Theoretically, it should be OK to remove and re-add remote storage configuration to
//! the pageserver config at any time, since it doesn't make a difference to
//! [`Timeline::load_layer_map`].
//! Of course, the remote timeline dir must not change while we have de-configured
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
//! in remote storage.
//! But note that we don't test any of this right now.
//!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
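The marker-file protocol above is simple, but its crash-safety hinges on ordering: create the marker before any remote listing, remove it only after all timeline metadata is safely on local disk. A minimal sketch of that lifecycle (the file name and helper functions are illustrative, not the pageserver's actual API):

    use std::fs;
    use std::io;
    use std::path::Path;

    // Created before attach starts; a crash mid-attach leaves it behind.
    fn begin_attach(tenant_dir: &Path) -> io::Result<()> {
        fs::write(tenant_dir.join("attaching"), b"")
    }

    // Removed only once all remote timeline metadata is persisted locally.
    fn finish_attach(tenant_dir: &Path) -> io::Result<()> {
        fs::remove_file(tenant_dir.join("attaching"))
    }

    // On restart: a present marker means local state may be incomplete,
    // so the whole attach procedure must be re-run.
    fn needs_reattach(tenant_dir: &Path) -> bool {
        tenant_dir.join("attaching").exists()
    }
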
@@ -187,7 +211,8 @@ mod upload;
use anyhow::Context;
use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};

// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff::{
@@ -212,7 +237,7 @@ use crate::metrics::{
};
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::storage_layer::AsLayerDesc;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
use crate::tenant::TIMELINES_SEGMENT_NAME;
use crate::{
@@ -230,13 +255,10 @@ use utils::id::{TenantId, TimelineId};

use self::index::IndexPart;

use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::storage_layer::LayerFileName;
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;

pub(crate) use download::{is_temp_download_file, list_remote_timelines};
pub(crate) use index::LayerFileMetadata;

// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
    //

    /// Download index file
    pub async fn download_index_file(
        &self,
        cancel: CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
            &self.tenant_id,
            &self.timeline_id,
            self.generation,
            cancel,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -609,203 +627,101 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
    pub(crate) fn schedule_layer_file_upload(
    pub fn schedule_layer_file_upload(
        self: &Arc<Self>,
        layer: ResidentLayer,
        layer_file_name: &LayerFileName,
        layer_metadata: &LayerFileMetadata,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        self.schedule_layer_file_upload0(upload_queue, layer);
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

    fn schedule_layer_file_upload0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        layer: ResidentLayer,
    ) {
        let metadata = layer.metadata();

        upload_queue
            .latest_files
            .insert(layer.layer_desc().filename(), metadata.clone());
            .insert(layer_file_name.clone(), layer_metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

        info!("scheduled layer file upload {layer}");
        let op = UploadOp::UploadLayer(layer, metadata);
        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

        info!("scheduled layer file upload {layer_file_name}");

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

    /// Launch a delete operation in the background.
    ///
    /// The operation does not modify local filesystem state.
    /// The operation does not modify local state but assumes the local files have already been
    /// deleted, and is used to mirror those changes to remote.
    ///
    /// Note: This schedules an index file upload before the deletions. The
    /// deletion won't actually be performed, until all previously scheduled
    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
        names: &[LayerFileName],
        names: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        let with_generations =
            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);

        // Launch the tasks immediately, if possible
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
    /// layer files, leaving them dangling.
    ///
    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
    /// is invoked on them.
    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        // just forget the return value; after uploading the next index_part.json, we can consider
        // the layer files as "dangling". this is fine, at worst case we create work for the
        // scrubber.

        let names = gc_layers.iter().map(|x| x.layer_desc().filename());

        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

        self.launch_queued_tasks(upload_queue);

        Ok(())
    }

    /// Update the remote index file, removing the to-be-deleted files from the index,
    /// allowing scheduling of actual deletions later.
    fn schedule_unlinking_of_layers_from_index_part0<I>(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
    ) -> Vec<(LayerFileName, Generation)>
    where
        I: IntoIterator<Item = LayerFileName>,
    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need to update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

        // Decorate our list of names with each name's generation, dropping
        // names that are unexpectedly missing from our metadata.
        let with_generations: Vec<_> = names
            .into_iter()
            .filter_map(|name| {
                let meta = upload_queue.latest_files.remove(&name);
        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
        //
        // Once we start removing files from upload_queue.latest_files, there's
        // no going back! Otherwise, some of the files would already be removed
        // from latest_files, but not yet scheduled for deletion. Use a closure
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
            // Decorate our list of names with each name's generation, dropping
            // names that are unexpectedly missing from our metadata.
            let with_generations: Vec<_> = names
                .into_iter()
                .filter_map(|name| {
                    // Remove from latest_files, learning the file's remote generation in the process
                    let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
                    Some((name, meta.generation))
                } else {
                    // This can only happen if we forgot to schedule the file upload
                    // before scheduling the delete. Log it because it is a rare/strange
                    // situation, and in case something is misbehaving, we'd like to know which
                    // layers experienced this.
                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
                    None
                }
            })
            .collect();
                    if let Some(meta) = meta {
                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
                        Some((name, meta.generation))
                    } else {
                        // This can only happen if we forgot to schedule the file upload
                        // before scheduling the delete. Log it because it is a rare/strange
                        // situation, and in case something is misbehaving, we'd like to know which
                        // layers experienced this.
                        info!(
                            "Deleting layer {name} not found in latest_files list, never uploaded?"
                        );
                        None
                    }
                })
                .collect();

        #[cfg(feature = "testing")]
        for (name, gen) in &with_generations {
            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
                if &unexpected == gen {
                    tracing::error!("{name} was unlinked twice with same generation");
                } else {
                    tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
                }
            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
                self.schedule_index_upload(upload_queue, metadata);
            }
        }

        // after unlinking files from the upload_queue.latest_files we must always schedule an
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
            self.schedule_index_upload(upload_queue, metadata);
        }

        with_generations
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
    pub(crate) fn schedule_deletion_of_unlinked(
        self: &Arc<Self>,
        layers: Vec<(LayerFileName, Generation)>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        self.schedule_deletion_of_unlinked0(upload_queue, layers);
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }

    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        with_generations: Vec<(LayerFileName, Generation)>,
    ) {
        for (name, gen) in &with_generations {
            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
        }

        #[cfg(feature = "testing")]
        for (name, gen) in &with_generations {
            match upload_queue.dangling_files.remove(name) {
                Some(same) if &same == gen => { /* expected */ }
                Some(other) => {
                    tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
                }
                None => {
                    tracing::error!("{name} was unlinked but was not dangling");
                }
            for (name, gen) in &with_generations {
                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
            }
        }

        // schedule the actual deletions
        let op = UploadOp::Delete(Delete {
            layers: with_generations,
        });
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

    /// Schedules a compaction update to the remote `index_part.json`.
    ///
    /// `compacted_from` represents the L0 names which have been `compacted_to` L1 layers.
    pub(crate) fn schedule_compaction_update(
        self: &Arc<Self>,
        compacted_from: &[Layer],
        compacted_to: &[ResidentLayer],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        for layer in compacted_to {
            self.schedule_layer_file_upload0(upload_queue, layer.clone());
        }

        let names = compacted_from.iter().map(|x| x.layer_desc().filename());

        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);
            // schedule the actual deletions
            let op = UploadOp::Delete(Delete {
                layers: with_generations,
            });
            self.calls_unfinished_metric_begin(&op);
            upload_queue.queued_operations.push_back(op);

            // Launch the tasks immediately, if possible
            self.launch_queued_tasks(upload_queue);
        };
        no_bail_here();
        Ok(())
    }

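The invariant spelled out above — unlink from `index_part.json` first, upload the new index, and only then delete objects — can be modeled as a tiny operation queue. A standalone sketch with simplified types (strings instead of LayerFileName, no generations):

    #[derive(Debug, PartialEq)]
    enum Op {
        UploadIndex,
        Delete(Vec<String>),
    }

    // Unlink `names` from the in-memory index view and queue the follow-up ops
    // in the only safe order: index upload strictly before the deletions.
    fn schedule_deletion(latest_files: &mut Vec<String>, names: &[&str], queue: &mut Vec<Op>) {
        let mut unlinked = Vec::new();
        for name in names {
            if let Some(pos) = latest_files.iter().position(|f| f == name) {
                latest_files.remove(pos);
                unlinked.push(name.to_string());
            }
            // else: never uploaded; nothing to unlink or delete remotely.
        }
        if !unlinked.is_empty() {
            queue.push(Op::UploadIndex);
            queue.push(Op::Delete(unlinked));
        }
    }

    fn main() {
        let mut latest = vec!["a".to_string(), "b".to_string()];
        let mut queue = Vec::new();
        schedule_deletion(&mut latest, &["a", "zzz"], &mut queue);
        assert_eq!(latest, ["b"]);
        assert_eq!(queue, [Op::UploadIndex, Op::Delete(vec!["a".to_string()])]);
    }
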
@@ -1177,12 +1093,16 @@ impl RemoteTimelineClient {
        }

        let upload_result: anyhow::Result<()> = match &task.op {
            UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
                let path = layer.local_path();
            UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
                let path = self
                    .conf
                    .timeline_path(&self.tenant_id, &self.timeline_id)
                    .join(layer_file_name.file_name());

                upload::upload_timeline_layer(
                    self.conf,
                    &self.storage_impl,
                    path,
                    &path,
                    layer_metadata,
                    self.generation,
                )
@@ -1456,8 +1376,6 @@ impl RemoteTimelineClient {
            num_inprogress_deletions: 0,
            inprogress_tasks: HashMap::default(),
            queued_operations: VecDeque::default(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::default(),
        };

        let upload_queue = std::mem::replace(
@@ -1501,6 +1419,13 @@ impl RemoteTimelineClient {
            }
        }
    }

    pub(crate) fn get_layer_metadata(
        &self,
        name: &LayerFileName,
    ) -> anyhow::Result<Option<LayerFileMetadata>> {
        self.upload_queue.lock().unwrap().get_layer_metadata(name)
    }
}

pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1588,7 +1513,6 @@ mod tests {
    context::RequestContext,
    tenant::{
        harness::{TenantHarness, TIMELINE_ID},
        storage_layer::Layer,
        Generation, Tenant, Timeline,
    },
    DEFAULT_PG_VERSION,
@@ -1731,11 +1655,7 @@ mod tests {
        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
        let initial_index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1761,29 +1681,32 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files, schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
        let content_1 = dummy_contents("foo");
        let content_2 = dummy_contents("bar");
        let content_3 = dummy_contents("baz");

        let layers = [
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
        ]
        .into_iter()
        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();

            Layer::for_resident(
                harness.conf,
                &timeline,
                name,
                LayerFileMetadata::new(contents.len() as u64, generation),
            )
        }).collect::<Vec<_>>();
        for (filename, content) in [
            (&layer_file_name_1, &content_1),
            (&layer_file_name_2, &content_2),
            (&layer_file_name_3, &content_3),
        ] {
            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
        }

        client
            .schedule_layer_file_upload(layers[0].clone())
            .schedule_layer_file_upload(
                &layer_file_name_1,
                &LayerFileMetadata::new(content_1.len() as u64, generation),
            )
            .unwrap();
        client
            .schedule_layer_file_upload(layers[1].clone())
            .schedule_layer_file_upload(
                &layer_file_name_2,
                &LayerFileMetadata::new(content_2.len() as u64, generation),
            )
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1824,11 +1747,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
        let index_part = match client
            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1841,42 +1760,38 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
                &layers[0].layer_desc().filename().file_name(),
                &layers[1].layer_desc().filename().file_name(),
                &layer_file_name_1.file_name(),
                &layer_file_name_2.file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
            .schedule_layer_file_upload(layers[2].clone())
            .schedule_layer_file_upload(
                &layer_file_name_3,
                &LayerFileMetadata::new(content_3.len() as u64, generation),
            )
            .unwrap();

        // this is no longer consistent with how deletion works with Layer::drop, but in this test
        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
        // spawn_blocking started by the drop.
        client
            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
            assert_eq!(upload_queue.queued_operations.len(), 2);
            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
            assert_eq!(upload_queue.num_inprogress_deletions, 0);
            assert_eq!(
                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                0
            );
            assert!(upload_queue.queued_operations.len() == 2);
            assert!(upload_queue.inprogress_tasks.len() == 1);
            assert!(upload_queue.num_inprogress_layer_uploads == 1);
            assert!(upload_queue.num_inprogress_deletions == 0);
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
                &layers[0].layer_desc().filename().file_name(),
                &layers[1].layer_desc().filename().file_name(),
                &layer_file_name_1.file_name(),
                &layer_file_name_2.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1890,8 +1805,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
                &layers[1].layer_desc().filename().file_name(),
                &layers[2].layer_desc().filename().file_name(),
                &layer_file_name_2.file_name(),
                &layer_file_name_3.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1920,13 +1835,6 @@ mod tests {
        )
        .unwrap();

        let layer_file_1 = Layer::for_resident(
            harness.conf,
            &timeline,
            layer_file_name_1.clone(),
            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
        );

        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1962,7 +1870,10 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
            .schedule_layer_file_upload(layer_file_1.clone())
            .schedule_layer_file_upload(
                &layer_file_name_1,
                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
            )
            .unwrap();

        let actual_b = get_bytes_started_stopped();
@@ -2027,7 +1938,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
            .download_index_file(CancellationToken::new())
            .download_index_file()
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));

@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};

@@ -170,43 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
    cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
) -> anyhow::Result<HashSet<TimelineId>> {
    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

    let listing = download_retry_forever(
        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
        &format!("list timelines for {tenant_id}"),
        cancel,
    let timelines = download_retry(
        || storage.list_prefixes(Some(&remote_path)),
        &format!("list prefixes for {tenant_id}"),
    )
    .await?;

    let mut timeline_ids = HashSet::new();
    let mut other_prefixes = HashSet::new();
    if timelines.is_empty() {
        anyhow::bail!("no timelines found on the remote storage")
    }

    let mut timeline_ids = HashSet::new();

    for timeline_remote_storage_key in timelines {
        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
            // A `deleted` key within `timelines/` is a marker file, not a timeline. Ignore it.
            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
            continue;
        }

    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

        match object_name.parse::<TimelineId>() {
            Ok(t) => timeline_ids.insert(t),
            Err(_) => other_prefixes.insert(object_name.to_string()),
        };
        let timeline_id: TimelineId = object_name
            .parse()
            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;

        // list_prefixes is assumed to return unique names. Ensure this here.
        // NB: it's safer to bail out than warn-log this because the pageserver
        // needs to absolutely know about _all_ timelines that exist, so that
        // GC knows all the branchpoints. If we skipped over a timeline instead,
        // GC could delete a layer that's still needed by that timeline.
        anyhow::ensure!(
            !timeline_ids.contains(&timeline_id),
            "list_prefixes contains duplicate timeline id {timeline_id}"
        );
        timeline_ids.insert(timeline_id);
    }

    for key in listing.keys {
        let object_name = key
            .object_name()
            .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
        other_prefixes.insert(object_name.to_string());
    }

    Ok((timeline_ids, other_prefixes))
    Ok(timeline_ids)
}

async fn do_download_index_part(
@@ -214,11 +224,10 @@ async fn do_download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    index_generation: Generation,
    cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
    let index_part_bytes = download_retry(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;

@@ -233,7 +242,6 @@ async fn do_download_index_part(
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
        cancel,
    )
    .await?;

@@ -255,28 +263,19 @@ pub(super) async fn download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    my_generation: Generation,
    cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
            .await;
        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download_index_part(
        storage,
        tenant_id,
        timeline_id,
        my_generation,
        cancel.clone(),
    )
    .await;
    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -296,14 +295,8 @@ pub(super) async fn download_index_part(
    // we want to find the most recent index from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download_index_part(
        storage,
        tenant_id,
        timeline_id,
        my_generation.previous(),
        cancel.clone(),
    )
    .await;
    let res =
        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
    match res {
        Ok(index_part) => {
            tracing::debug!("Found index_part from previous generation");
@@ -347,14 +340,13 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
            do_download_index_part(storage, tenant_id, timeline_id, g).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did. Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
                .await
            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
        }
    }
}
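The generation fallback above follows a fixed preference order: our own generation, then the immediate predecessor, then the newest index from any earlier generation found by listing, and finally the legacy generation-less path. A standalone model of that order (u32 stands in for Generation, None for the legacy index; the real code probes remote storage instead of a slice):

    // Which index_part.json generation should be loaded, given what exists remotely.
    fn pick_index_generation(mine: u32, available: &[Option<u32>]) -> Option<u32> {
        let has = |g: Option<u32>| available.contains(&g);
        if has(Some(mine)) {
            Some(mine) // stale re-attach: an index in our own generation already exists
        } else if mine > 0 && has(Some(mine - 1)) {
            Some(mine - 1) // common case: predecessor's generation (listing avoided)
        } else if let Some(g) = available.iter().flatten().filter(|g| **g < mine).max() {
            Some(*g) // general case: newest index from any earlier generation
        } else {
            None // legacy pre-generation path
        }
    }

    fn main() {
        assert_eq!(pick_index_generation(5, &[Some(5), Some(4)]), Some(5));
        assert_eq!(pick_index_generation(5, &[Some(4), Some(2)]), Some(4));
        assert_eq!(pick_index_generation(5, &[Some(2), Some(1)]), Some(2));
        assert_eq!(pick_index_generation(5, &[None]), None);
    }
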
@@ -384,23 +376,3 @@ where
    )
    .await
}

async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
    cancel: CancellationToken,
) -> Result<T, DownloadError>
where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
{
    backoff::retry(
        op,
        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
    )
    .await
}

@@ -98,7 +98,7 @@ impl IndexPart {
    const LATEST_VERSION: usize = 4;

    // Versions we may see when reading from a bucket.
    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
    pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];

    pub const FILE_NAME: &'static str = "index_part.json";

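KNOWN_VERSIONS exists so a reader can reject on-disk formats it has never shipped before deserializing further. A sketch of how such a gate might look (assumed usage, not the crate's actual code path):

    // Reject index_part.json documents whose version we do not know.
    fn check_index_part_version(v: usize) -> Result<(), String> {
        const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
        if KNOWN_VERSIONS.contains(&v) {
            Ok(())
        } else {
            Err(format!("unknown index_part.json version: {v}"))
        }
    }

    fn main() {
        assert!(check_index_part_version(4).is_ok());
        assert!(check_index_part_version(5).is_err());
    }
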
@@ -72,8 +72,6 @@ pub(super) async fn upload_timeline_layer<'a>(
        // upload. However, a nonexistent file can also be indicative of
        // something worse, like when a file is scheduled for upload before
        // it has been written to disk.
        //
        // This is tested against `test_compaction_delete_before_upload`
        info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
        return Ok(());
    }

@@ -4,21 +4,26 @@ pub mod delta_layer;
mod filename;
mod image_layer;
mod inmemory_layer;
mod layer;
mod layer_desc;
mod remote_layer;

use crate::config::PageServerConf;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Key;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
use bytes::Bytes;
use camino::Utf8PathBuf;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{
    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::ops::Range;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,8 +39,7 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};

pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
pub use remote_layer::RemoteLayer;

pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
@@ -70,7 +74,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
}

/// Return value from [`Layer::get_value_reconstruct_data`]
/// Return value from Layer::get_page_reconstruct_data
#[derive(Clone, Copy, Debug)]
pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -175,6 +179,26 @@ impl LayerAccessStats {
        new
    }

    /// Creates a clone of `self` and records `new_status` in the clone.
    ///
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
    ///
    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn clone_for_residence_change(
        &self,
        new_status: LayerResidenceStatus,
    ) -> LayerAccessStats {
        let clone = {
            let inner = self.0.lock().unwrap();
            inner.clone()
        };
        let new = LayerAccessStats(Mutex::new(clone));
        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
        new
    }

    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -297,12 +321,95 @@ impl LayerAccessStats {
    }
}

/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
/// required by [`LayerMap`](super::layer_map::LayerMap).
///
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known from the context in which the layers
/// are used (the timeline).
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    ///
    /// Return data needed to reconstruct the given page at the given LSN.
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
    /// on the first call, or a struct with a cached older image of the page if one
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> Result<ValueReconstructResult>;
}

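The Continue contract above implies a caller-side loop that walks ancestor layers, accumulating WAL records until a base image is found. A standalone model of that walk (simplified types; no real Key, Lsn, or WAL redo):

    enum Reconstruct {
        Complete,
        Continue, // need more data from the predecessor layer
    }

    #[derive(Default)]
    struct State {
        records: Vec<&'static str>, // collected WAL records, newest first
        img: Option<&'static str>,  // base page image, once found
    }

    // Each layer appends what it has, and tells the caller whether to keep walking.
    fn visit(records: &[&'static str], has_image: bool, state: &mut State) -> Reconstruct {
        state.records.extend_from_slice(records);
        if has_image {
            state.img = Some("base image");
            Reconstruct::Complete
        } else {
            Reconstruct::Continue
        }
    }

    fn main() {
        // Newest layer first, as in the real lookup: two deltas, then an image.
        let layers: &[(&[&'static str], bool)] =
            &[(&["delta@30"], false), (&["delta@20"], false), (&[], true)];
        let mut state = State::default();
        for (records, has_image) in layers {
            // Keep passing the same `state`, as the trait doc requires.
            if matches!(visit(records, *has_image, &mut state), Reconstruct::Complete) {
                break;
            }
        }
        assert_eq!(state.records, ["delta@30", "delta@20"]);
        assert!(state.img.is_some());
    }
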
/// Get a layer descriptor from a layer.
pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
}

/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// a range of LSNs.
///
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions. On-disk layers are stored as files on disk, and are
/// immutable. This trait presents the common functionality of in-memory and
/// on-disk layers.
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
pub trait PersistentLayer: Layer + AsLayerDesc {
    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
    fn filename(&self) -> LayerFileName {
        self.layer_desc().filename()
    }

    // Path to the layer file in the local filesystem.
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<Utf8PathBuf>;

    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
        None
    }

    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
        None
    }

    fn is_remote_layer(&self) -> bool {
        false
    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

    fn access_stats(&self) -> &LayerAccessStats;
}

pub fn downcast_remote_layer(
    layer: &Arc<dyn PersistentLayer>,
) -> Option<std::sync::Arc<RemoteLayer>> {
    if layer.is_remote_layer() {
        Arc::clone(layer).downcast_remote_layer()
    } else {
        None
    }
}

pub mod tests {
    use super::*;

@@ -340,6 +447,19 @@ pub mod tests {
    }
}

/// Helper enum to hold a PageServerConf, or a path
///
/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
/// global config, and paths to layer files are constructed using the tenant/timeline
/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
/// struct for a file on disk without having a page server running, so we have no
/// config. In that case, we use the Path variant to hold the full path to the file on
/// disk.
enum PathOrConf {
    Path(Utf8PathBuf),
    Conf(&'static PageServerConf),
}

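The PathOrConf split above is a small strategy pattern: resolve a layer file's path either from an explicit path (pagectl) or from the global config's directory layout (pageserver). A standalone sketch of that resolution (directory names are illustrative, not the pageserver's exact layout):

    use std::path::PathBuf;

    enum PathOrConf {
        Path(PathBuf), // pagectl: the caller supplied the full file path
        Conf(PathBuf), // stand-in for &'static PageServerConf (its workdir)
    }

    fn layer_path(this: &PathOrConf, tenant: &str, timeline: &str, file_name: &str) -> PathBuf {
        match this {
            PathOrConf::Path(p) => p.clone(),
            PathOrConf::Conf(workdir) => workdir
                .join("tenants")
                .join(tenant)
                .join("timelines")
                .join(timeline)
                .join(file_name),
        }
    }
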
/// Range wrapping newtype, which uses display to render Debug.
///
/// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.

@@ -34,17 +34,18 @@ use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::fs::{self, File};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
@@ -58,7 +59,10 @@ use utils::{
    lsn::Lsn,
};

use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
use super::{
    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
    PersistentLayerDesc,
};

///
/// Header stored in the beginning of the file
@@ -178,12 +182,20 @@ impl DeltaKey {
    }
}

/// This is used only from `pagectl`. Within pageserver, all layers are
/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
/// DeltaLayer is the in-memory data structure associated with an on-disk delta
/// file.
///
/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
pub struct DeltaLayer {
    path: Utf8PathBuf,
    path_or_conf: PathOrConf,

    pub desc: PersistentLayerDesc,

    access_stats: LayerAccessStats,

    inner: OnceCell<Arc<DeltaLayerInner>>,
}

@@ -200,8 +212,6 @@ impl std::fmt::Debug for DeltaLayer {
    }
}

/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
/// file.
pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -211,6 +221,12 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
}

impl AsRef<DeltaLayerInner> for DeltaLayerInner {
    fn as_ref(&self) -> &DeltaLayerInner {
        self
    }
}

impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -220,6 +236,19 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
}

#[async_trait::async_trait]
impl Layer for DeltaLayer {
    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
            .await
    }
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
impl std::fmt::Display for DeltaLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
@@ -233,9 +262,40 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
self.local_path()
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
self.delete_resident_layer_file()
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
self.info(reset)
|
||||
}
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats {
|
||||
self.access_stats()
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
self.desc.timeline_id,
|
||||
self.desc.key_range.start,
|
||||
self.desc.key_range.end,
|
||||
self.desc.lsn_range.start,
|
||||
self.desc.lsn_range.end,
|
||||
self.desc.file_size,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
@@ -243,7 +303,119 @@ impl DeltaLayer {
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
|
||||
inner.dump(ctx).await
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
let desc = match dump_blob(val, ctx).await {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => {
|
||||
let err: anyhow::Error = err;
|
||||
format!("ERROR: {err}")
|
||||
}
|
||||
};
|
||||
println!(" key {key} at {lsn}: {desc}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
|
||||
ensure!(self.desc.key_range.contains(&key));
|
||||
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.layer_desc().filename().file_name();
|
||||
let lsn_range = self.layer_desc().lsn_range.clone();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name,
|
||||
layer_file_size: self.desc.file_size,
|
||||
lsn_start: lsn_range.start,
|
||||
lsn_end: lsn_range.end,
|
||||
remote: false,
|
||||
access_stats,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.access_stats
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
path_or_conf: &PathOrConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
fname: &DeltaFileName,
|
||||
) -> Utf8PathBuf {
|
||||
match path_or_conf {
|
||||
PathOrConf::Path(path) => path.clone(),
|
||||
PathOrConf::Conf(conf) => conf
|
||||
.timeline_path(tenant_id, timeline_id)
|
||||
.join(fname.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
@@ -289,21 +461,52 @@ impl DeltaLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
|
||||
let summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.layer_desc().filename().file_name();
|
||||
let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Arc::new(loaded))
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &DeltaFileName,
|
||||
file_size: u64,
|
||||
access_stats: LayerAccessStats,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
filename.key_range.clone(),
|
||||
filename.lsn_range.clone(),
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pagectl' binary.
|
||||
@@ -317,7 +520,7 @@ impl DeltaLayer {
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path: path.to_path_buf(),
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
summary.tenant_id,
|
||||
summary.timeline_id,
|
||||
@@ -330,9 +533,29 @@ impl DeltaLayer {
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
self.desc.delta_file_name()
|
||||
}
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
fn path(&self) -> Utf8PathBuf {
|
||||
self.path.clone()
|
||||
pub fn path(&self) -> Utf8PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
&self.desc.tenant_id,
|
||||
&self.desc.timeline_id,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner, ctx)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,7 +660,7 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
|
||||
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -494,21 +717,37 @@ impl DeltaLayerWriterInner {
|
||||
// Note: Because we opened the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
|
||||
let desc = PersistentLayerDesc::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_start..key_end,
|
||||
self.lsn_range.clone(),
|
||||
metadata.len(),
|
||||
);
|
||||
let layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
desc: PersistentLayerDesc::new_delta(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_start..key_end,
|
||||
self.lsn_range.clone(),
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
|
||||
trace!("created delta layer {}", layer.local_path());
|
||||
trace!("created delta layer {final_path}");
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
@@ -589,12 +828,8 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(key_end, timeline).await
|
||||
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -732,17 +967,15 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
|
||||
pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &'a T,
|
||||
ctx: &'b RequestContext,
|
||||
) -> Result<Vec<DeltaEntry<'a>>> {
|
||||
let file = &self.file;
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
|
||||
|
||||
@@ -755,7 +988,7 @@ impl DeltaLayerInner {
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
|
||||
Adapter(self),
|
||||
Adapter(dl),
|
||||
)),
|
||||
};
|
||||
let pos = BlobRef(value).pos();
|
||||
@@ -782,61 +1015,10 @@ impl DeltaLayerInner {
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of value storage,
|
||||
// which corresponds to beginning of the index
|
||||
last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
}
|
||||
Ok(all_keys)
|
||||
}
|
||||
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
self.index_start_blk, self.index_root_blk
|
||||
);
|
||||
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let keys = self.load_keys(ctx).await?;
|
||||
|
||||
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
|
||||
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
for entry in keys {
|
||||
let DeltaEntry { key, lsn, val, .. } = entry;
|
||||
let desc = match dump_blob(val, ctx).await {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => {
|
||||
format!("ERROR: {err}")
|
||||
}
|
||||
};
|
||||
println!(" key {key} at {lsn}: {desc}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
@@ -876,9 +1058,3 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
|
||||
self.0.as_ref().file.read_blk(blknum, ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
fn as_ref(&self) -> &DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
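
// Illustrative note, not part of the commits being compared: with the
// `impl AsRef<DeltaLayerInner> for DeltaLayerInner` added earlier in this diff,
// the generic `load_keys<T: AsRef<DeltaLayerInner> + Clone>` accepts both a
// plain value and an `Arc`, because the standard library already provides
// `impl<T: ?Sized> AsRef<T> for Arc<T>`:
//
//   DeltaLayerInner::load_keys(&inner, ctx).await?;      // inner: DeltaLayerInner
//   DeltaLayerInner::load_keys(&arc_inner, ctx).await?;  // arc_inner: Arc<DeltaLayerInner>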

@@ -31,23 +31,21 @@ use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::Timeline;
use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::fs::{self, File};
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;

@@ -58,7 +56,7 @@ use utils::{
};

use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

///
/// Header stored in the beginning of the file
@@ -116,14 +114,22 @@ impl Summary {
    }
}

/// This is used only from `pagectl`. Within pageserver, all layers are
/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file.
///
/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
pub struct ImageLayer {
    path: Utf8PathBuf,
    path_or_conf: PathOrConf,

    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,

    access_stats: LayerAccessStats,

    inner: OnceCell<ImageLayerInner>,
}

@@ -140,8 +146,6 @@ impl std::fmt::Debug for ImageLayer {
    }
}

/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file.
pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -162,11 +166,73 @@ impl std::fmt::Debug for ImageLayerInner {
    }
}

impl ImageLayerInner {
    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
        let file = &self.file;
#[async_trait::async_trait]
impl Layer for ImageLayer {
    /// Look up given page in the file
    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
            .await
    }
}

/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for ImageLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
}

impl AsLayerDesc for ImageLayer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        &self.desc
    }
}

impl PersistentLayer for ImageLayer {
    fn local_path(&self) -> Option<Utf8PathBuf> {
        self.local_path()
    }

    fn delete_resident_layer_file(&self) -> Result<()> {
        self.delete_resident_layer_file()
    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        self.info(reset)
    }

    fn access_stats(&self) -> &LayerAccessStats {
        self.access_stats()
    }
}

impl ImageLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.lsn,
            self.desc.is_incremental(),
            self.desc.file_size
        );

        if !verbose {
            return Ok(());
        }

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
        let file = &inner.file;
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

        tree_reader.dump().await?;

@@ -184,36 +250,69 @@ impl ImageLayerInner {

        Ok(())
    }
}

/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for ImageLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

        let inner = self
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
        inner
            .get_value_reconstruct_data(key, reconstruct_state, ctx)
            .await
            // FIXME: makes no sense to dump paths
            .with_context(|| format!("read {}", self.path()))
    }
}

impl AsLayerDesc for ImageLayer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        &self.desc
    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
        Some(self.path())
    }
}

impl ImageLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        self.desc.dump();

        if !verbose {
            return Ok(());
        }

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

        inner.dump(ctx).await?;

    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
        Ok(())
    }

    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.layer_desc().filename().file_name();
        let lsn_start = self.layer_desc().image_layer_lsn();

        HistoricLayerInfo::Image {
            layer_file_name,
            layer_file_size: self.desc.file_size,
            lsn_start,
            remote: false,
            access_stats: self.access_stats.as_api_model(reset),
        }
    }

    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
        &self.access_stats
    }

    fn path_for(
        path_or_conf: &PathOrConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        fname: &ImageFileName,
    ) -> Utf8PathBuf {
        match path_or_conf {
            PathOrConf::Path(path) => path.to_path_buf(),
            PathOrConf::Conf(conf) => conf
                .timeline_path(&tenant_id, &timeline_id)
                .join(fname.to_string()),
        }
    }

    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -249,21 +348,54 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
        let expected_summary = match &self.path_or_conf {
            PathOrConf::Conf(_) => Some(Summary::from(self)),
            PathOrConf::Path(_) => None,
        };

        // not production code
        let actual_filename = path.file_name().unwrap().to_owned();
        let expected_filename = self.layer_desc().filename().file_name();
        let loaded =
            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
                .await?;

        if actual_filename != expected_filename {
            println!("warning: filename does not match what is expected from in-file summary");
            println!("actual: {:?}", actual_filename);
            println!("expected: {:?}", expected_filename);
        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
            let actual_filename = path.file_name().unwrap().to_owned();
            let expected_filename = self.filename().file_name();

            if actual_filename != expected_filename {
                println!("warning: filename does not match what is expected from in-file summary");
                println!("actual: {:?}", actual_filename);
                println!("expected: {:?}", expected_filename);
            }
        }

        Ok(loaded)
    }

    /// Create an ImageLayer struct representing an existing file on disk
    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_id: TenantId,
        filename: &ImageFileName,
        file_size: u64,
        access_stats: LayerAccessStats,
    ) -> ImageLayer {
        ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
            desc: PersistentLayerDesc::new_img(
                tenant_id,
                timeline_id,
                filename.key_range.clone(),
                filename.lsn,
                file_size,
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            access_stats,
            inner: OnceCell::new(),
        }
    }

    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -275,7 +407,7 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
            path: path.to_path_buf(),
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -289,8 +421,18 @@ impl ImageLayer {
        })
    }

    fn path(&self) -> Utf8PathBuf {
        self.path.clone()
    fn layer_name(&self) -> ImageFileName {
        self.desc.image_file_name()
    }

    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> Utf8PathBuf {
        Self::path_for(
            &self.path_or_conf,
            self.desc.timeline_id,
            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
}

@@ -462,7 +604,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
    async fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -516,14 +658,33 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
        let layer = ImageLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
            desc,
            lsn: self.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: OnceCell::new(),
        };

        // fsync the file
        file.sync_all().await?;

        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
        // Rename the file to its final name
        //
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
        let final_path = ImageLayer::path_for(
            &PathOrConf::Conf(self.conf),
            self.timeline_id,
            self.tenant_id,
            &ImageFileName {
                key_range: self.key_range.clone(),
                lsn: self.lsn,
            },
        );
        std::fs::rename(self.path, final_path)?;

        trace!("created image layer {}", layer.local_path());
        trace!("created image layer {}", layer.path());

        Ok(layer)
    }
@@ -585,11 +746,8 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
    pub(crate) async fn finish(
        mut self,
        timeline: &Arc<Timeline>,
    ) -> anyhow::Result<super::ResidentLayer> {
        self.inner.take().unwrap().finish(timeline).await
    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
        self.inner.take().unwrap().finish().await
    }
}


@@ -10,12 +10,11 @@ use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::tenant::Timeline;
use crate::walrecord;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::sync::OnceLock;
use tracing::*;
use utils::{
    bin_ser::BeSer,
@@ -29,7 +28,7 @@ use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::RwLock;

use super::{DeltaLayerWriter, ResidentLayer};
use super::{DeltaLayer, DeltaLayerWriter, Layer};

pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -208,6 +207,20 @@ impl InMemoryLayer {
    }
}

#[async_trait::async_trait]
impl Layer for InMemoryLayer {
    async fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
            .await
    }
}

impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -216,13 +229,17 @@ impl std::fmt::Display for InMemoryLayer {
}

impl InMemoryLayer {
    ///
    /// Get layer size.
    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

    ///
    /// Create a new, empty, in-memory layer
    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -314,11 +331,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
    pub(crate) async fn write_to_disk(
        &self,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
    ) -> Result<ResidentLayer> {
    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -363,8 +376,7 @@ impl InMemoryLayer {
            }
        }

        // MAX is used here because we identify L0 layers by full key range
        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
        Ok(delta_layer)
    }
}

File diff suppressed because it is too large
@@ -1,3 +1,4 @@
use anyhow::Result;
use core::fmt::Display;
use std::ops::Range;
use utils::{
@@ -5,7 +6,7 @@ use utils::{
    lsn::Lsn,
};

use crate::repository::Key;
use crate::{context::RequestContext, repository::Key};

use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -99,22 +100,6 @@ impl PersistentLayerDesc {
        }
    }

    pub fn from_filename(
        tenant_id: TenantId,
        timeline_id: TimelineId,
        filename: LayerFileName,
        file_size: u64,
    ) -> Self {
        match filename {
            LayerFileName::Image(i) => {
                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
            }
            LayerFileName::Delta(d) => {
                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
            }
        }
    }

    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -188,31 +173,21 @@ impl PersistentLayerDesc {
        self.is_delta
    }

    pub fn dump(&self) {
        if self.is_delta {
            println!(
                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
                self.tenant_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.lsn_range.start,
                self.lsn_range.end,
                self.is_incremental(),
                self.file_size,
            );
        } else {
            println!(
                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
                self.tenant_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.image_layer_lsn(),
                self.is_incremental(),
                self.file_size
            );
        }
    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
            self.tenant_id,
            self.timeline_id,
            self.key_range.start,
            self.key_range.end,
            self.lsn_range.start,
            self.lsn_range.end,
            self.is_delta,
            self.is_incremental(),
            self.file_size,
        );

        Ok(())
    }

    pub fn file_size(&self) -> u64 {

pageserver/src/tenant/storage_layer/remote_layer.rs (new file, 216 lines)
@@ -0,0 +1,216 @@
//! A RemoteLayer is an in-memory placeholder for a layer file that exists
//! in remote storage.
//!
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::layer_manager::LayerManager;
use anyhow::{bail, Result};
use camino::Utf8PathBuf;
use pageserver_api::models::HistoricLayerInfo;
use std::ops::Range;
use std::sync::Arc;

use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
};

use super::filename::{DeltaFileName, ImageFileName};
use super::{
    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
};

/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
/// [`DeltaLayer`].
///
/// RemoteLayer might be downloaded on-demand during operations which are
/// allowed to download remote layers, during which it gets replaced with a
/// concrete `DeltaLayer` or `ImageLayer`.
///
/// See: [`crate::context::RequestContext`] for authorization to download
pub struct RemoteLayer {
    pub desc: PersistentLayerDesc,

    pub layer_metadata: LayerFileMetadata,

    access_stats: LayerAccessStats,

    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,

    /// Has `LayerMap::replace` failed for this (true) or not (false).
    ///
    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
    /// unprocessable, because a LayerMap::replace failed.
    ///
    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
    /// a possible fast loop between `Timeline::get_reconstruct_data` and
    /// `Timeline::download_remote_layer`, which also logs.
    ///
    /// [`ongoing_download`]: Self::ongoing_download
    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
}
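
// Illustrative sketch, not part of the commits being compared: the
// single-permit `ongoing_download` semaphore above is what serializes
// concurrent download attempts for one layer. Whichever task acquires the
// permit performs the download; the others wait on the same semaphore and then
// observe the downloaded layer:
//
//   let _permit = remote_layer.ongoing_download.acquire().await?;
//   // at most one task per RemoteLayer reaches this point at a time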

impl std::fmt::Debug for RemoteLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("RemoteLayer")
            .field("file_name", &self.desc.filename())
            .field("layer_metadata", &self.layer_metadata)
            .field("is_incremental", &self.desc.is_incremental())
            .finish()
    }
}

#[async_trait::async_trait]
impl Layer for RemoteLayer {
    async fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
        _reconstruct_state: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
    }
}

/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for RemoteLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.layer_desc().short_id())
    }
}

impl AsLayerDesc for RemoteLayer {
    fn layer_desc(&self) -> &PersistentLayerDesc {
        &self.desc
    }
}

impl PersistentLayer for RemoteLayer {
    fn local_path(&self) -> Option<Utf8PathBuf> {
        None
    }

    fn delete_resident_layer_file(&self) -> Result<()> {
        bail!("remote layer has no layer file");
    }

    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
        Some(self)
    }

    fn is_remote_layer(&self) -> bool {
        true
    }

    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.layer_desc().filename().file_name();
        let lsn_range = self.layer_desc().lsn_range.clone();

        if self.desc.is_delta {
            HistoricLayerInfo::Delta {
                layer_file_name,
                layer_file_size: self.layer_metadata.file_size(),
                lsn_start: lsn_range.start,
                lsn_end: lsn_range.end,
                remote: true,
                access_stats: self.access_stats.as_api_model(reset),
            }
        } else {
            HistoricLayerInfo::Image {
                layer_file_name,
                layer_file_size: self.layer_metadata.file_size(),
                lsn_start: lsn_range.start,
                remote: true,
                access_stats: self.access_stats.as_api_model(reset),
            }
        }
    }

    fn access_stats(&self) -> &LayerAccessStats {
        &self.access_stats
    }
}

impl RemoteLayer {
    pub fn new_img(
        tenantid: TenantId,
        timelineid: TimelineId,
        fname: &ImageFileName,
        layer_metadata: &LayerFileMetadata,
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
            desc: PersistentLayerDesc::new_img(
                tenantid,
                timelineid,
                fname.key_range.clone(),
                fname.lsn,
                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
            access_stats,
        }
    }

    pub fn new_delta(
        tenantid: TenantId,
        timelineid: TimelineId,
        fname: &DeltaFileName,
        layer_metadata: &LayerFileMetadata,
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
            desc: PersistentLayerDesc::new_delta(
                tenantid,
                timelineid,
                fname.key_range.clone(),
                fname.lsn_range.clone(),
                layer_metadata.file_size(),
            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
            access_stats,
        }
    }

    /// Create a Layer struct representing this layer, after it has been downloaded.
    pub(crate) fn create_downloaded_layer(
        &self,
        _layer_map_lock_held_witness: &LayerManager,
        conf: &'static PageServerConf,
        file_size: u64,
    ) -> Arc<dyn PersistentLayer> {
        if self.desc.is_delta {
            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
                conf,
                self.desc.timeline_id,
                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats
                    .clone_for_residence_change(LayerResidenceStatus::Resident),
            ))
        } else {
            let fname = self.desc.image_file_name();
            Arc::new(ImageLayer::new(
                conf,
                self.desc.timeline_id,
                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats
                    .clone_for_residence_change(LayerResidenceStatus::Resident),
            ))
        }
    }
}
@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion};
use utils::completion;

static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
@@ -139,10 +139,7 @@ pub fn start_background_loops(
/// Compaction task's main loop
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    // How many errors we have seen consecutively
    let mut error_run_count = 0;

    let wait_duration = Duration::from_secs(2);
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -179,19 +176,9 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                    let wait_duration = backoff::exponential_backoff_duration_seconds(
                        error_run_count,
                        1.0,
                        MAX_BACKOFF_SECS,
                    );
                    error_run_count += 1;
                    error!(
                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
                        wait_duration
                    );
                    Duration::from_secs_f64(wait_duration)
                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
                } else {
                    error_run_count = 0;
                    period
                }
            };
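
// For reference (an assumption about the shape of the helper removed above,
// not its actual source): `backoff::exponential_backoff_duration_seconds(n,
// base, max)` in `utils` computes a capped exponential delay, roughly:
//
//   fn exponential_backoff_duration_seconds(n: u32, base: f64, max: f64) -> f64 {
//       if n == 0 { 0.0 } else { (base * 2f64.powi(n as i32 - 1)).min(max) }
//   }
//
// so with base 1.0 and MAX_BACKOFF_SECS 300.0 the retry delay would grow
// 1s, 2s, 4s, ... and saturate at five minutes; this diff replaces it with a
// fixed wait_duration.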
|
||||
@@ -215,10 +202,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
/// GC task's main loop
|
||||
///
|
||||
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
const MAX_BACKOFF_SECS: f64 = 300.0;
|
||||
// How many errors we have seen consequtively
|
||||
let mut error_run_count = 0;
|
||||
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
// GC might require downloading, to find the cutoff LSN that corresponds to the
|
||||
@@ -260,19 +244,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
error!(
|
||||
"Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
|
||||
wait_duration
|
||||
);
|
||||
Duration::from_secs_f64(wait_duration)
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
}
|
||||
};
|
||||
@@ -361,7 +335,7 @@ pub(crate) fn warn_when_period_overrun(
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
||||
// though there's no way to output the actual config value.
|
||||
info!(
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
?task,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -38,14 +38,6 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
}
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Shut down the layer flush task before the remote client, as one depends on the other
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::LayerFlushTask),
|
||||
Some(timeline.tenant_id),
|
||||
Some(timeline.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.stop();
|
||||
@@ -302,7 +294,6 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("remove delete mark")
|
||||
}
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ use crate::{
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
tasks::{BackgroundLoopKind, RateLimitError},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
@@ -209,26 +210,15 @@ impl Timeline {
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<_> = {
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
// disk_usage_eviction_task both select the same layers to be evicted, and
|
||||
// seemingly free up double the space. both succeeding is of no consequence.
|
||||
let guard = match hist_layer.keep_resident().await {
|
||||
Ok(Some(l)) => l,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// these should not happen, but we cannot make them statically impossible right
|
||||
// now.
|
||||
tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if hist_layer.is_remote_layer() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
|
||||
// We only use this fallback if there's an implementation error.
|
||||
@@ -259,7 +249,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
if no_activity_for > p.threshold {
|
||||
candidates.push(guard.drop_eviction_guard())
|
||||
candidates.push(hist_layer)
|
||||
}
|
||||
}
|
||||
candidates
|
||||
@@ -278,7 +268,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let results = match self
|
||||
.evict_layer_batch(remote_client, &candidates, cancel)
|
||||
.evict_layer_batch(remote_client, &candidates[..], cancel.clone())
|
||||
.await
|
||||
{
|
||||
Err(pre_err) => {
|
||||
@@ -289,7 +279,7 @@ impl Timeline {
|
||||
Ok(results) => results,
|
||||
};
|
||||
assert_eq!(results.len(), candidates.len());
|
||||
for result in results {
|
||||
for (l, result) in candidates.iter().zip(results) {
|
||||
match result {
|
||||
None => {
|
||||
stats.skipped_for_shutdown += 1;
|
||||
@@ -297,10 +287,24 @@ impl Timeline {
|
||||
Some(Ok(())) => {
|
||||
stats.evicted += 1;
|
||||
}
|
||||
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
// compaction/gc removed the file while we were waiting on layer_removal_cs
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(layer = %l, "failed to evict layer: {e}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(EvictionError::MetadataInconsistency(detail))) => {
|
||||
warn!(layer = %l, "failed to evict layer: {detail}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if stats.candidates == stats.not_evictable {
|
||||
|
||||
@@ -12,16 +12,27 @@ use crate::{
|
||||
tenant::{
|
||||
layer_map::{BatchedUpdates, LayerMap},
|
||||
storage_layer::{
|
||||
AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
|
||||
ResidentLayer,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
|
||||
PersistentLayerDesc, PersistentLayerKey,
|
||||
},
|
||||
timeline::compare_arced_layers,
|
||||
},
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
pub(crate) struct LayerManager {
|
||||
layer_map: LayerMap,
|
||||
layer_fmgr: LayerFileManager<Layer>,
|
||||
layer_fmgr: LayerFileManager,
|
||||
}
|
||||
|
||||
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
|
||||
/// scheduling deletes in remote client.
|
||||
pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
|
||||
|
||||
impl ApplyGcResultGuard<'_> {
|
||||
pub(crate) fn flush(self) {
|
||||
self.0.flush();
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
@@ -32,7 +43,7 @@ impl LayerManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
@@ -44,12 +55,21 @@ impl LayerManager {
|
||||
&self.layer_map
|
||||
}
|
||||
|
||||
/// Replace layers in the layer file manager, used in evictions and layer downloads.
|
||||
pub(crate) fn replace_and_verify(
|
||||
&mut self,
|
||||
expected: Arc<dyn PersistentLayer>,
|
||||
new: Arc<dyn PersistentLayer>,
|
||||
) -> Result<()> {
|
||||
self.layer_fmgr.replace_and_verify(expected, new)
|
||||
}
|
||||
|
||||
/// Called from `load_layer_map`. Initialize the layer manager with:
|
||||
/// 1. all on-disk layers
|
||||
/// 2. next open layer (with disk disk_consistent_lsn LSN)
|
||||
pub(crate) fn initialize_local_layers(
|
||||
&mut self,
|
||||
on_disk_layers: Vec<Layer>,
|
||||
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
next_open_layer_at: Lsn,
|
||||
) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
@@ -144,19 +164,10 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Add image layers to the layer map, called from `create_image_layers`.
|
||||
pub(crate) fn track_new_image_layers(
|
||||
&mut self,
|
||||
image_layers: &[ResidentLayer],
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for layer in image_layers {
|
||||
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
|
||||
// record these here instead of Layer::finish_creating because otherwise partial
|
||||
// failure with create_image_layers would balloon up the physical size gauge. downside
|
||||
// is that all layers need to be created before metrics are updated.
|
||||
metrics.record_new_file_metrics(layer.layer_desc().file_size);
|
||||
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
@@ -164,71 +175,76 @@ impl LayerManager {
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub(crate) fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layer: Option<&ResidentLayer>,
|
||||
delta_layer: Option<DeltaLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
let inmem = self
|
||||
.layer_map
|
||||
.frozen_layers
|
||||
.pop_front()
|
||||
.expect("there must be a inmem layer to flush");
|
||||
let l = self.layer_map.frozen_layers.pop_front();
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
|
||||
// Only one task may call this function at a time (for this
|
||||
// timeline). If two tasks tried to flush the same frozen
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
|
||||
assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
|
||||
|
||||
if let Some(l) = delta_layer {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
updates.flush();
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Called when compaction is completed.
|
||||
pub(crate) fn finish_compact_l0(
|
||||
&mut self,
|
||||
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: &[Layer],
|
||||
compact_to: &[ResidentLayer],
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
compact_from: Vec<Arc<dyn PersistentLayer>>,
|
||||
compact_to: Vec<Arc<dyn PersistentLayer>>,
|
||||
metrics: &TimelineMetrics,
|
||||
) {
|
||||
) -> Result<()> {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for l in compact_to {
|
||||
Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
|
||||
metrics.record_new_file_metrics(l.layer_desc().file_size);
|
||||
Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
for l in compact_from {
|
||||
Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
|
||||
// NB: the layer file identified by descriptor `l` is guaranteed to be present
|
||||
// in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
|
||||
// time, even though we dropped `Timeline::layers` in between.
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs.clone(),
|
||||
l,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
)?;
|
||||
}
|
||||
updates.flush();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Called when garbage collecting the timeline. Returns a guard that will apply the updates to the layer map.
|
||||
pub(crate) fn finish_gc_timeline(
|
||||
&mut self,
|
||||
layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Layer>,
|
||||
) {
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
gc_layers: Vec<Arc<dyn PersistentLayer>>,
|
||||
metrics: &TimelineMetrics,
|
||||
) -> Result<ApplyGcResultGuard> {
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
for doomed_layer in gc_layers {
|
||||
Self::delete_historic_layer(
|
||||
layer_removal_cs,
|
||||
&doomed_layer,
|
||||
layer_removal_cs.clone(),
|
||||
doomed_layer,
|
||||
&mut updates,
|
||||
metrics,
|
||||
&mut self.layer_fmgr,
|
||||
);
|
||||
)?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
|
||||
}
|
||||
updates.flush()
|
||||
Ok(ApplyGcResultGuard(updates))
|
||||
}
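To make the intended call pattern concrete, here is a minimal sketch (not part of the diff; the helper name and the way the caller obtains the lock guard are assumptions) of how GC would drive the new guard-returning API: removals are applied to the layer map only once the whole batch has succeeded.

fn apply_gc(
    layers: &mut LayerManager,
    layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
    doomed: Vec<Arc<dyn PersistentLayer>>,
    metrics: &TimelineMetrics,
) -> anyhow::Result<()> {
    // Collect the batched layer-map updates behind the guard...
    let guard = layers.finish_gc_timeline(layer_removal_cs, doomed, metrics)?;
    // ...and only publish them once the whole batch succeeded.
    guard.flush();
    Ok(())
}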
|
||||
|
||||
/// Helper function to insert a layer into the layer map and file manager.
|
||||
fn insert_historic_layer(
|
||||
layer: Layer,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager<Layer>,
|
||||
mapping: &mut LayerFileManager,
|
||||
) {
|
||||
updates.insert_historic(layer.layer_desc().clone());
|
||||
mapping.insert(layer);
|
||||
@@ -238,12 +254,17 @@ impl LayerManager {
|
||||
/// Remote storage is not affected by this operation.
|
||||
fn delete_historic_layer(
|
||||
// we cannot remove layers otherwise, since gc and compaction will race
|
||||
_layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: &Layer,
|
||||
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_>,
|
||||
mapping: &mut LayerFileManager<Layer>,
|
||||
) {
|
||||
metrics: &TimelineMetrics,
|
||||
mapping: &mut LayerFileManager,
|
||||
) -> anyhow::Result<()> {
|
||||
let desc = layer.layer_desc();
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
metrics.resident_physical_size_sub(desc.file_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
// Maybe instead discard all layer map historic versions that
|
||||
@@ -252,18 +273,21 @@ impl LayerManager {
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(desc);
|
||||
mapping.remove(layer);
|
||||
layer.garbage_collect_on_drop();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||
pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
HashMap<PersistentLayerKey, Arc<T>>,
|
||||
);
|
||||
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
@@ -273,14 +297,14 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub(crate) fn insert(&mut self, layer: T) {
|
||||
pub(crate) fn insert(&mut self, layer: Arc<T>) {
|
||||
let present = self.0.insert(layer.layer_desc().key(), layer.clone());
|
||||
if present.is_some() && cfg!(debug_assertions) {
|
||||
panic!("overwriting a layer: {:?}", layer.layer_desc())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &T) -> bool {
|
||||
pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
@@ -288,7 +312,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, layer: &T) {
|
||||
pub(crate) fn remove(&mut self, layer: Arc<T>) {
|
||||
let present = self.0.remove(&layer.layer_desc().key());
|
||||
if present.is_none() && cfg!(debug_assertions) {
|
||||
panic!(
|
||||
@@ -297,4 +321,39 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
|
||||
let key = expected.layer_desc().key();
|
||||
let other = new.layer_desc().key();
|
||||
|
||||
let expected_l0 = LayerMap::is_l0(expected.layer_desc());
|
||||
let new_l0 = LayerMap::is_l0(new.layer_desc());
|
||||
|
||||
fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
|
||||
"layermap-replace-notfound"
|
||||
));
|
||||
|
||||
anyhow::ensure!(
|
||||
key == other,
|
||||
"expected and new layer have different keys: {key:?} != {other:?}"
|
||||
);
|
||||
|
||||
anyhow::ensure!(
|
||||
expected_l0 == new_l0,
|
||||
"one layer is l0 while the other is not: {expected_l0} != {new_l0}"
|
||||
);
|
||||
|
||||
if let Some(layer) = self.0.get_mut(&key) {
|
||||
anyhow::ensure!(
|
||||
compare_arced_layers(&expected, layer),
|
||||
"another layer was found instead of expected, expected={expected:?}, new={new:?}",
|
||||
expected = Arc::as_ptr(&expected),
|
||||
new = Arc::as_ptr(layer),
|
||||
);
|
||||
*layer = new;
|
||||
Ok(())
|
||||
} else {
|
||||
anyhow::bail!("layer was not found");
|
||||
}
|
||||
}
|
||||
}
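A rough sketch of how a caller such as layer download or eviction might use the replace_and_verify API above; the helper name is made up, and it is assumed that `remote` and `downloaded` refer to the same layer descriptor.

fn install_downloaded_layer(
    layers: &mut LayerManager,
    remote: Arc<dyn PersistentLayer>,
    downloaded: Arc<dyn PersistentLayer>,
) -> anyhow::Result<()> {
    // Fails if the two descriptors differ, or if another task already
    // replaced the layer behind our back.
    layers.replace_and_verify(remote, downloaded)
}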
|
||||
|
||||
@@ -26,7 +26,8 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
@@ -136,17 +137,8 @@ pub(super) async fn connection_manager_loop_step(
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("broker subscription failed: {e}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
|
||||
@@ -122,7 +122,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {wal_source_connconf:?}");
|
||||
|
||||
let (replication_client, connection) = {
|
||||
let (mut replication_client, connection) = {
|
||||
let mut config = wal_source_connconf.to_tokio_postgres_config();
|
||||
config.application_name("pageserver");
|
||||
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
@@ -205,7 +205,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
let identify = identify_system(&replication_client).await?;
|
||||
let identify = identify_system(&mut replication_client).await?;
|
||||
info!("{identify:?}");
|
||||
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
@@ -444,7 +444,7 @@ struct IdentifySystem {
|
||||
struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
|
||||
async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = client.simple_query(query_str).await?;
|
||||
|
||||
@@ -459,7 +459,7 @@ async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.first() {
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::storage_layer::ResidentLayer;
|
||||
use super::Generation;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
@@ -80,14 +79,6 @@ pub(crate) struct UploadQueueInitialized {
|
||||
/// tasks to finish. For example, metadata upload cannot be performed before all
|
||||
/// preceding layer file uploads have completed.
|
||||
pub(crate) queued_operations: VecDeque<UploadOp>,
|
||||
|
||||
/// Files which have been unlinked but whose deletion has not yet been scheduled. Only kept around
|
||||
/// for error logging.
|
||||
///
|
||||
/// This is kept behind the testing feature to catch problems in tests; assuming we could have a
|
||||
/// bug causing leaks, it's better not to leave this enabled in production builds.
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
|
||||
}
|
||||
|
||||
impl UploadQueueInitialized {
|
||||
@@ -144,8 +135,6 @@ impl UploadQueue {
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
#[cfg(feature = "testing")]
|
||||
dangling_files: HashMap::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -191,8 +180,6 @@ impl UploadQueue {
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
#[cfg(feature = "testing")]
|
||||
dangling_files: HashMap::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -216,6 +203,18 @@ impl UploadQueue {
|
||||
UploadQueue::Stopped(stopped) => Ok(stopped),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_layer_metadata(
|
||||
&self,
|
||||
name: &LayerFileName,
|
||||
) -> anyhow::Result<Option<LayerFileMetadata>> {
|
||||
match self {
|
||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress upload or delete task.
|
||||
@@ -238,7 +237,7 @@ pub(crate) struct Delete {
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(ResidentLayer, LayerFileMetadata),
|
||||
UploadLayer(LayerFileName, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
@@ -253,13 +252,13 @@ pub(crate) enum UploadOp {
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(layer, metadata) => {
|
||||
UploadOp::UploadLayer(path, metadata) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
layer,
|
||||
path.file_name(),
|
||||
metadata.file_size(),
|
||||
metadata.generation
|
||||
metadata.generation,
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
|
||||
@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use utils::fs_ext;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -174,78 +173,37 @@ impl OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
/// Identify error types that should always terminate the process. Other
|
||||
/// error types may be eligible for retry.
|
||||
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
|
||||
use nix::errno::Errno::*;
|
||||
match e.raw_os_error().map(nix::errno::from_i32) {
|
||||
Some(EIO) => {
|
||||
// Terminate on EIO because we no longer trust the device to store
|
||||
// data safely, or to uphold persistence guarantees on fsync.
|
||||
true
|
||||
}
|
||||
Some(EROFS) => {
|
||||
// Terminate on EROFS because a filesystem is usually remounted
|
||||
// readonly when it has experienced some critical issue, so the same
|
||||
// logic as EIO applies.
|
||||
true
|
||||
}
|
||||
Some(EACCES) => {
|
||||
// Terminate on EACCES because we should always have permissions
|
||||
// for our own data dir: if we don't, then we can't do our job and
|
||||
// need administrative intervention to fix permissions. Terminating
|
||||
// is the best way to make sure we stop cleanly rather than going
|
||||
// into infinite retry loops, and will make it clear to the outside
|
||||
// world that we need help.
|
||||
true
|
||||
}
|
||||
_ => {
|
||||
// Treat all other local file I/O errors as retryable. This includes:
|
||||
// - ENOSPC: we stay up and wait for eviction to free some space
|
||||
// - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
|
||||
// - WriteZero, Interrupted: these are used internally by VirtualFile
|
||||
false
|
||||
}
|
||||
}
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CrashsafeOverwriteError {
|
||||
#[error("final path has no parent dir")]
|
||||
FinalPathHasNoParentDir,
|
||||
#[error("remove tempfile")]
|
||||
RemovePreviousTempfile(#[source] std::io::Error),
|
||||
#[error("create tempfile")]
|
||||
CreateTempfile(#[source] std::io::Error),
|
||||
#[error("write tempfile")]
|
||||
WriteContents(#[source] std::io::Error),
|
||||
#[error("sync tempfile")]
|
||||
SyncTempfile(#[source] std::io::Error),
|
||||
#[error("rename tempfile to final path")]
|
||||
RenameTempfileToFinalPath(#[source] std::io::Error),
|
||||
#[error("open final path parent dir")]
|
||||
OpenFinalPathParentDir(#[source] std::io::Error),
|
||||
#[error("sync final path parent dir")]
|
||||
SyncFinalPathParentDir(#[source] std::io::Error),
|
||||
}
|
||||
|
||||
/// Call this when the local filesystem gives us an error with an external
|
||||
/// cause: this includes EIO, EROFS, and EACCES: all these indicate either
|
||||
/// bad storage or bad configuration, and we can't fix that from inside
|
||||
/// a running process.
|
||||
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
|
||||
tracing::error!("Fatal I/O error: {e}: {context})");
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
pub(crate) trait MaybeFatalIo<T> {
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
|
||||
fn fatal_err(self, context: &str) -> T;
|
||||
}
|
||||
|
||||
impl<T> MaybeFatalIo<T> for std::io::Result<T> {
|
||||
/// Terminate the process if the result is an error of a fatal type, else pass it through
|
||||
///
|
||||
/// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
|
||||
/// not on ENOSPC.
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
|
||||
if let Err(e) = &self {
|
||||
if is_fatal_io_error(e) {
|
||||
on_fatal_io_error(e, context);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Terminate the process on any I/O error.
|
||||
///
|
||||
/// This is appropriate for reads on files that we know exist: they should always work.
|
||||
fn fatal_err(self, context: &str) -> T {
|
||||
impl CrashsafeOverwriteError {
|
||||
/// Returns true iff the new contents are durably stored.
|
||||
pub fn are_new_contents_durable(&self) -> bool {
|
||||
match self {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
on_fatal_io_error(&e, context);
|
||||
}
|
||||
Self::FinalPathHasNoParentDir => false,
|
||||
Self::RemovePreviousTempfile(_) => false,
|
||||
Self::CreateTempfile(_) => false,
|
||||
Self::WriteContents(_) => false,
|
||||
Self::SyncTempfile(_) => false,
|
||||
Self::RenameTempfileToFinalPath(_) => false,
|
||||
Self::OpenFinalPathParentDir(_) => false,
|
||||
Self::SyncFinalPathParentDir(_) => true,
|
||||
}
|
||||
}
|
||||
}
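For illustration only, a sketch of how the MaybeFatalIo extension shown in this hunk is meant to be used on a write path; the function name and call site are hypothetical, and the trait is assumed to be in scope.

use std::path::Path;

fn persist_bytes(path: &Path, buf: &[u8]) -> std::io::Result<()> {
    // maybe_fatal_err aborts the process on fatal errno values (EIO, EROFS,
    // EACCES) and passes everything else, e.g. ENOSPC, back to the caller.
    std::fs::write(path, buf).maybe_fatal_err("persisting bytes")
}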
|
||||
@@ -326,13 +284,15 @@ impl VirtualFile {
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> std::io::Result<()> {
|
||||
) -> Result<(), CrashsafeOverwriteError> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
|
||||
};
|
||||
std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
|
||||
match std::fs::remove_file(tmp_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
|
||||
}
|
||||
let mut file = Self::open_with_options(
|
||||
tmp_path,
|
||||
OpenOptions::new()
|
||||
@@ -341,20 +301,31 @@ impl VirtualFile {
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true),
|
||||
)
|
||||
.await?;
|
||||
file.write_all(content).await?;
|
||||
file.sync_all().await?;
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
|
||||
file.write_all(content)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::WriteContents)?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
std::fs::rename(tmp_path, final_path)
|
||||
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFile., and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd =
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
|
||||
final_parent_dirfd.sync_all().await?;
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
|
||||
final_parent_dirfd
|
||||
.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
|
||||
Ok(())
|
||||
}
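The same write-tempfile/fsync/rename/fsync-parent protocol, sketched with plain std::fs for comparison. This is illustration only (Unix-specific for the directory fsync), not the VirtualFile-based implementation above.

use std::io::Write;
use std::path::Path;

fn crashsafe_overwrite_sketch(final_path: &Path, tmp_path: &Path, content: &[u8]) -> std::io::Result<()> {
    // 1. Write the new contents into a tempfile and make them durable.
    let mut tmp = std::fs::File::create(tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?;
    drop(tmp); // close before renaming
    // 2. Atomically replace the final path with the tempfile.
    std::fs::rename(tmp_path, final_path)?;
    // 3. fsync the parent directory so the rename itself survives a crash.
    let parent = final_path.parent().expect("final path must have a parent dir");
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}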
|
||||
|
||||
|
||||
@@ -443,7 +443,7 @@ impl<'a> WalIngest<'a> {
|
||||
&mut self,
|
||||
buf: &mut Bytes,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
decoded: &DecodedWALRecord,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
@@ -749,7 +749,7 @@ impl<'a> WalIngest<'a> {
|
||||
&mut self,
|
||||
buf: &mut Bytes,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
decoded: &DecodedWALRecord,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
|
||||
@@ -857,8 +857,7 @@ impl WalRedoProcess {
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
@@ -908,8 +907,7 @@ impl WalRedoProcess {
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "c.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/pqformat.h"
|
||||
@@ -64,21 +63,6 @@ int max_reconnect_attempts = 60;
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
|
||||
static bool pageserver_flush(void);
|
||||
static void pageserver_disconnect(void);
|
||||
|
||||
|
||||
static pqsigfunc prev_signal_handler;
|
||||
|
||||
static void
|
||||
pageserver_sighup_handler(SIGNAL_ARGS)
|
||||
{
|
||||
if (prev_signal_handler)
|
||||
{
|
||||
prev_signal_handler(postgres_signal_arg);
|
||||
}
|
||||
neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
|
||||
pageserver_disconnect();
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_connect(int elevel)
|
||||
@@ -416,7 +400,7 @@ pg_init_libpagestore(void)
|
||||
NULL,
|
||||
&page_server_connstring,
|
||||
"",
|
||||
PGC_SIGHUP,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
@@ -498,8 +482,5 @@ pg_init_libpagestore(void)
|
||||
old_redo_read_buffer_filter = redo_read_buffer_filter;
|
||||
redo_read_buffer_filter = neon_redo_read_buffer_filter;
|
||||
}
|
||||
|
||||
prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
|
||||
|
||||
lfc_init();
|
||||
}
|
||||
|
||||
@@ -201,16 +201,6 @@ WalRedoMain(int argc, char *argv[])
|
||||
#endif
|
||||
|
||||
am_wal_redo_postgres = true;
|
||||
/*
|
||||
* Pageserver treats any output to stderr as an ERROR, so we must
|
||||
* set the log level as early as possible to only log FATAL and
|
||||
* above during WAL redo (note that loglevel ERROR also logs LOG,
|
||||
* which is super strange but that's not something we can solve
|
||||
* for here. ¯\_(-_-)_/¯
|
||||
*/
|
||||
SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE);
|
||||
SetConfigOption("client_min_messages", "ERROR", PGC_SUSET,
|
||||
PGC_S_OVERRIDE);
|
||||
|
||||
/*
|
||||
* WAL redo does not need a large number of buffers. And speed of
|
||||
@@ -895,12 +885,7 @@ apply_error_callback(void *arg)
|
||||
StringInfoData buf;
|
||||
|
||||
initStringInfo(&buf);
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
if (record->record)
|
||||
#else
|
||||
if (record->decoded_record)
|
||||
#endif
|
||||
xlog_outdesc(&buf, record);
|
||||
xlog_outdesc(&buf, record);
|
||||
|
||||
/* translator: %s is a WAL record description */
|
||||
errcontext("WAL redo at %X/%X for %s",
|
||||
|
||||
12
poetry.lock
generated
12
poetry.lock
generated
@@ -2447,20 +2447,20 @@ test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.1"
|
||||
version = "2.2.3"
|
||||
description = "The comprehensive WSGI web application library."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
|
||||
{file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
|
||||
{file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
|
||||
{file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
MarkupSafe = ">=2.1.1"
|
||||
|
||||
[package.extras]
|
||||
watchdog = ["watchdog (>=2.3)"]
|
||||
watchdog = ["watchdog"]
|
||||
|
||||
[[package]]
|
||||
name = "wrapt"
|
||||
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
|
||||
content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
|
||||
|
||||
@@ -5,7 +5,7 @@ mod link;
|
||||
pub use link::LinkAuthError;
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
|
||||
use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
|
||||
use crate::proxy::{handle_try_wake, retry_after};
|
||||
use crate::{
|
||||
auth::{self, ClientCredentials},
|
||||
config::AuthenticationConfig,
|
||||
@@ -134,14 +134,13 @@ async fn auth_quirks_creds(
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
// If there's no project so far, that entails that the client doesn't
|
||||
// support SNI or other means of passing the endpoint (project) name.
|
||||
// We now expect to see a very specific payload in the place of password.
|
||||
if creds.project.is_none() {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::password_hack(creds, client, latency_timer).await;
|
||||
return hacks::password_hack(creds, client).await;
|
||||
}
|
||||
|
||||
// Password hack should set the project name.
|
||||
@@ -152,11 +151,11 @@ async fn auth_quirks_creds(
|
||||
// Currently, we use it for websocket connections (latency).
|
||||
if allow_cleartext {
|
||||
// Password will be checked by the compute node later.
|
||||
return hacks::cleartext_hack(client, latency_timer).await;
|
||||
return hacks::cleartext_hack(client).await;
|
||||
}
|
||||
|
||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||
classic::authenticate(api, extra, creds, client, config, latency_timer).await
|
||||
classic::authenticate(api, extra, creds, client, config).await
|
||||
}
|
||||
|
||||
/// True to its name, this function encapsulates our current auth trade-offs.
|
||||
@@ -168,18 +167,8 @@ async fn auth_quirks(
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
let auth_stuff = auth_quirks_creds(
|
||||
api,
|
||||
extra,
|
||||
creds,
|
||||
client,
|
||||
allow_cleartext,
|
||||
config,
|
||||
latency_timer,
|
||||
)
|
||||
.await?;
|
||||
let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
@@ -244,7 +233,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
|
||||
use BackendType::*;
|
||||
|
||||
@@ -257,16 +245,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(
|
||||
api,
|
||||
extra,
|
||||
creds,
|
||||
client,
|
||||
allow_cleartext,
|
||||
config,
|
||||
latency_timer,
|
||||
)
|
||||
.await?
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
|
||||
}
|
||||
Postgres(api, creds) => {
|
||||
info!(
|
||||
@@ -276,16 +255,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
|
||||
);
|
||||
|
||||
let api = api.as_ref();
|
||||
auth_quirks(
|
||||
api,
|
||||
extra,
|
||||
creds,
|
||||
client,
|
||||
allow_cleartext,
|
||||
config,
|
||||
latency_timer,
|
||||
)
|
||||
.await?
|
||||
auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
|
||||
@@ -4,7 +4,6 @@ use crate::{
|
||||
compute,
|
||||
config::AuthenticationConfig,
|
||||
console::{self, AuthInfo, ConsoleReqExtra},
|
||||
proxy::LatencyTimer,
|
||||
sasl, scram,
|
||||
stream::PqStream,
|
||||
};
|
||||
@@ -17,7 +16,6 @@ pub(super) async fn authenticate(
|
||||
creds: &ClientCredentials<'_>,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
config: &'static AuthenticationConfig,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
info!("fetching user's authentication info");
|
||||
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
|
||||
@@ -38,26 +36,24 @@ pub(super) async fn authenticate(
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret);
|
||||
|
||||
let auth_flow = flow.begin(scram).await.map_err(|error| {
|
||||
warn!(?error, "error sending scram acknowledgement");
|
||||
error
|
||||
})?;
|
||||
|
||||
let auth_outcome = tokio::time::timeout(
|
||||
config.scram_protocol_timeout,
|
||||
async {
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
|
||||
flow.begin(scram).await.map_err(|error| {
|
||||
warn!(?error, "error sending scram acknowledgement");
|
||||
error
|
||||
})?.authenticate().await.map_err(|error| {
|
||||
warn!(?error, "error processing scram messages");
|
||||
error
|
||||
})
|
||||
}
|
||||
auth_flow.authenticate(),
|
||||
)
|
||||
.await
|
||||
.map_err(|error| {
|
||||
warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
|
||||
auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
|
||||
})??;
|
||||
})?
|
||||
.map_err(|error| {
|
||||
warn!(?error, "error processing scram messages");
|
||||
error
|
||||
})?;
|
||||
|
||||
let client_key = match auth_outcome {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use super::{AuthSuccess, ComputeCredentials};
|
||||
use crate::{
|
||||
auth::{self, AuthFlow, ClientCredentials},
|
||||
proxy::LatencyTimer,
|
||||
stream,
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -13,13 +12,8 @@ use tracing::{info, warn};
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn cleartext_hack(
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
warn!("cleartext auth flow override is enabled, proceeding");
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
|
||||
let password = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword)
|
||||
.await?
|
||||
@@ -38,13 +32,8 @@ pub async fn cleartext_hack(
|
||||
pub async fn password_hack(
|
||||
creds: &mut ClientCredentials<'_>,
|
||||
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
latency_timer: &mut LatencyTimer,
|
||||
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
|
||||
warn!("project not specified, resorting to the password hack auth flow");
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = latency_timer.pause();
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
.await?
|
||||
|
||||
@@ -4,11 +4,10 @@ use proxy::config::AuthenticationConfig;
|
||||
use proxy::config::HttpConfig;
|
||||
use proxy::console;
|
||||
use proxy::http;
|
||||
use proxy::usage_metrics;
|
||||
use proxy::metrics;
|
||||
|
||||
use anyhow::bail;
|
||||
use proxy::config::{self, ProxyConfig};
|
||||
use proxy::serverless;
|
||||
use std::pin::pin;
|
||||
use std::{borrow::Cow, net::SocketAddr};
|
||||
use tokio::net::TcpListener;
|
||||
@@ -16,10 +15,9 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
use utils::{project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
use clap::{Parser, ValueEnum};
|
||||
|
||||
@@ -101,8 +99,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
info!("Build_tag: {BUILD_TAG}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
::metrics::set_build_info_metric(GIT_VERSION);
|
||||
|
||||
let args = ProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
@@ -132,16 +129,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
|
||||
// TODO: rename the argument to something like serverless.
|
||||
// It now covers more than just websockets, it also covers SQL over HTTP.
|
||||
if let Some(serverless_address) = args.wss {
|
||||
let serverless_address: SocketAddr = serverless_address.parse()?;
|
||||
info!("Starting wss on {serverless_address}");
|
||||
let serverless_listener = TcpListener::bind(serverless_address).await?;
|
||||
if let Some(wss_address) = args.wss {
|
||||
let wss_address: SocketAddr = wss_address.parse()?;
|
||||
info!("Starting wss on {wss_address}");
|
||||
let wss_listener = TcpListener::bind(wss_address).await?;
|
||||
|
||||
client_tasks.spawn(serverless::task_main(
|
||||
client_tasks.spawn(http::websocket::task_main(
|
||||
config,
|
||||
serverless_listener,
|
||||
wss_listener,
|
||||
cancellation_token.clone(),
|
||||
));
|
||||
}
|
||||
@@ -149,11 +144,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
// maintenance tasks. these never return unless there's an error
|
||||
let mut maintenance_tasks = JoinSet::new();
|
||||
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
|
||||
maintenance_tasks.spawn(http::health_server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(http::server::task_main(http_listener));
|
||||
maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||
maintenance_tasks.spawn(metrics::task_main(metrics_config));
|
||||
}
|
||||
|
||||
let maintenance = loop {
|
||||
|
||||
@@ -223,7 +223,7 @@ pub struct CacheOptions {
|
||||
|
||||
impl CacheOptions {
|
||||
/// Default options for [`crate::console::provider::NodeInfoCache`].
|
||||
pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m";
|
||||
pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
|
||||
|
||||
/// Parse cache options passed via cmdline.
|
||||
/// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
|
||||
|
||||
@@ -13,7 +13,6 @@ pub struct ConsoleError {
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
pub role_secret: Box<str>,
|
||||
pub allowed_ips: Option<Vec<Box<str>>>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -188,31 +187,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_wake_compute() -> anyhow::Result<()> {
|
||||
let json = json!({
|
||||
"address": "0.0.0.0",
|
||||
"aux": dummy_aux(),
|
||||
});
|
||||
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_get_role_secret() -> anyhow::Result<()> {
|
||||
// Empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
// Non-empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
"allowed_ips": ["8.8.8.8"],
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +97,8 @@ pub mod errors {
|
||||
!text.contains("quota exceeded")
|
||||
&& !text.contains("the limit for current plan reached")
|
||||
}
|
||||
// retry server errors
|
||||
Self::Console { status, .. } if status.is_server_error() => true,
|
||||
_ => false,
|
||||
}
|
||||
}
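A hypothetical retry loop showing how a "should this error be retried?" predicate like the one in this hunk is typically consumed; none of these names (ShouldRetry, with_retries) come from the diff, they only sketch the shape of the caller.

trait ShouldRetry {
    fn should_retry(&self) -> bool;
}

async fn with_retries<F, Fut, T, E>(mut op: F, max_retries: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: ShouldRetry,
{
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            // Retry only transient failures, with a growing backoff.
            Err(e) if attempt < max_retries && e.should_retry() => {
                attempt += 1;
                tokio::time::sleep(std::time::Duration::from_millis(100 << attempt)).await;
            }
            Err(e) => return Err(e),
        }
    }
}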
|
||||
|
||||
@@ -59,7 +59,7 @@ impl Api {
|
||||
let rows = client.query(query, &[&creds.user]).await?;
|
||||
|
||||
// We can get at most one row, because `rolname` is unique.
|
||||
let row = match rows.first() {
|
||||
let row = match rows.get(0) {
|
||||
Some(row) => row,
|
||||
// This means that the user doesn't exist, so there can be no secret.
|
||||
// However, this is still a *valid* outcome which is very similar
|
||||
|
||||
@@ -49,7 +49,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
@@ -94,7 +94,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
|
||||
@@ -2,7 +2,10 @@
|
||||
//! Other modules should use stuff from this module instead of
|
||||
//! directly relying on deps like `reqwest` (think loose coupling).
|
||||
|
||||
pub mod health_server;
|
||||
pub mod conn_pool;
|
||||
pub mod server;
|
||||
pub mod sql_over_http;
|
||||
pub mod websocket;
|
||||
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
|
||||
@@ -22,8 +22,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
|
||||
use crate::{
|
||||
auth, console,
|
||||
metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||
proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
|
||||
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
|
||||
};
|
||||
use crate::{compute, config};
|
||||
|
||||
@@ -191,39 +191,22 @@ impl GlobalConnPool {
|
||||
// ok return cached connection if found and establish a new one otherwise
|
||||
let new_client = if let Some(client) = client {
|
||||
if client.inner.is_closed() {
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
)
|
||||
.await
|
||||
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
|
||||
} else {
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
client.session.send(session_id)?;
|
||||
latency_timer.pool_hit();
|
||||
latency_timer.success();
|
||||
return Ok(Client {
|
||||
conn_id: client.conn_id,
|
||||
inner: Some(client),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
)
|
||||
.await
|
||||
info!("pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
|
||||
};
|
||||
|
||||
match &new_client {
|
||||
@@ -260,7 +243,6 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
new_client.map(|inner| Client {
|
||||
conn_id: inner.conn_id,
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
@@ -268,18 +250,16 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
let conn_id = client.conn_id;
|
||||
|
||||
// We want to hold this open while we return. This ensures that the pool can't close
|
||||
// while we are in the middle of returning the connection.
|
||||
let closed = self.closed.read();
|
||||
if *closed {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
info!("pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if client.inner.is_closed() {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -311,9 +291,9 @@ impl GlobalConnPool {
|
||||
|
||||
// do logging outside of the mutex
|
||||
if returned {
|
||||
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
} else {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -360,7 +340,6 @@ impl GlobalConnPool {
|
||||
struct TokioMechanism<'a> {
|
||||
conn_info: &'a ConnInfo,
|
||||
session_id: uuid::Uuid,
|
||||
conn_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -374,14 +353,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
connect_to_compute_once(
|
||||
node_info,
|
||||
self.conn_info,
|
||||
timeout,
|
||||
self.conn_id,
|
||||
self.session_id,
|
||||
)
|
||||
.await
|
||||
connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
|
||||
@@ -394,7 +366,6 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
async fn connect_to_compute(
|
||||
config: &config::ProxyConfig,
|
||||
conn_info: &ConnInfo,
|
||||
conn_id: uuid::Uuid,
|
||||
session_id: uuid::Uuid,
|
||||
latency_timer: LatencyTimer,
|
||||
) -> anyhow::Result<ClientInner> {
|
||||
@@ -430,7 +401,6 @@ async fn connect_to_compute(
|
||||
|
||||
crate::proxy::connect_to_compute(
|
||||
&TokioMechanism {
|
||||
conn_id,
|
||||
conn_info,
|
||||
session_id,
|
||||
},
|
||||
@@ -446,7 +416,6 @@ async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
conn_id: uuid::Uuid,
|
||||
mut session: uuid::Uuid,
|
||||
) -> Result<ClientInner, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
@@ -461,6 +430,7 @@ async fn connect_to_compute_once(
|
||||
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = info_span!(parent: None, "connection", %conn_id);
|
||||
span.in_scope(|| {
|
||||
info!(%conn_info, %session, "new connection");
|
||||
@@ -514,7 +484,6 @@ async fn connect_to_compute_once(
|
||||
inner: client,
|
||||
session: tx,
|
||||
ids,
|
||||
conn_id,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -522,7 +491,6 @@ struct ClientInner {
|
||||
inner: tokio_postgres::Client,
|
||||
session: tokio::sync::watch::Sender<uuid::Uuid>,
|
||||
ids: Ids,
|
||||
conn_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
@@ -532,14 +500,12 @@ impl Client {
|
||||
}
|
||||
|
||||
pub struct Client {
|
||||
conn_id: uuid::Uuid,
|
||||
span: Span,
|
||||
inner: Option<ClientInner>,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
pub struct Discard<'a> {
|
||||
conn_id: uuid::Uuid,
|
||||
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
@@ -548,7 +514,6 @@ impl Client {
|
||||
let Self {
|
||||
inner,
|
||||
pool,
|
||||
conn_id,
|
||||
span: _,
|
||||
} = self;
|
||||
(
|
||||
@@ -556,10 +521,7 @@ impl Client {
|
||||
.as_mut()
|
||||
.expect("client inner should not be removed")
|
||||
.inner,
|
||||
Discard {
|
||||
pool,
|
||||
conn_id: *conn_id,
|
||||
},
|
||||
Discard { pool },
|
||||
)
|
||||
}
|
||||
|
||||
@@ -575,13 +537,13 @@ impl Discard<'_> {
|
||||
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
|
||||
if status != ReadyForQueryStatus::Idle {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn discard(&mut self) {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,36 +1,235 @@
|
||||
//! Routers for our serverless APIs
|
||||
//!
|
||||
//! Handles both SQL over HTTP and SQL over Websockets.
|
||||
|
||||
mod conn_pool;
|
||||
mod sql_over_http;
|
||||
mod websocket;
|
||||
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
protocol2::{ProxyProtocolAccept, WithClientIp},
|
||||
proxy::{
|
||||
handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
|
||||
NUM_CLIENT_CONNECTION_OPENED_COUNTER,
|
||||
},
|
||||
};
|
||||
use anyhow::bail;
|
||||
use hyper::StatusCode;
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
|
||||
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
|
||||
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
|
||||
use crate::{cancellation::CancelMap, config::ProxyConfig};
|
||||
use futures::StreamExt;
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
use hyper::{
|
||||
server::{
|
||||
accept,
|
||||
conn::{AddrIncoming, AddrStream},
|
||||
},
|
||||
Body, Method, Request, Response,
|
||||
upgrade::Upgraded,
|
||||
Body, Method, Request, Response, StatusCode,
|
||||
};
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::task::Poll;
|
||||
use std::{future::ready, sync::Arc};
|
||||
use std::{
|
||||
future::ready,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
use tls_listener::TlsListener;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::{
|
||||
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
|
||||
net::TcpListener,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
use super::{conn_pool::GlobalConnPool, sql_over_http};
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw {
|
||||
#[pin]
|
||||
stream: SyncWrapper<WebSocketStream<Upgraded>>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
}
|
||||
|
||||
impl WebSocketRw {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRw {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRw {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRw {
|
||||
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
// Please refer to poll_fill_buf's documentation.
|
||||
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
async fn serve_websocket(
|
||||
websocket: HyperWebsocket,
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRw::new(websocket),
|
||||
ClientMode::Websockets { hostname },
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn ws_handler(
|
||||
mut request: Request<Body>,
|
||||
config: &'static ProxyConfig,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
sni_hostname: Option<String>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
.get("host")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.and_then(|h| h.split(':').next())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
info!(session_id = ?session_id, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
// TODO: this deserves a refactor, as the function now also handles HTTP JSON clients besides websockets.
|
||||
// Right now I don't want to blow up the sql-over-http patch with file renames; do that as a follow-up instead.
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
sql_over_http::handle(
|
||||
request,
|
||||
sni_hostname,
|
||||
conn_pool,
|
||||
session_id,
|
||||
&config.http_config,
|
||||
)
|
||||
.await
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
Response::builder()
|
||||
.header("Allow", "OPTIONS, POST")
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
|
||||
)
|
||||
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
|
||||
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
|
||||
.body(Body::empty())
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
@@ -40,7 +239,7 @@ pub async fn task_main(
|
||||
info!("websocket server has shut down");
|
||||
}
|
||||
|
||||
let conn_pool = conn_pool::GlobalConnPool::new(config);
|
||||
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
|
||||
|
||||
// shutdown the connection pool
|
||||
tokio::spawn({
|
||||
@@ -101,15 +300,13 @@ pub async fn task_main(
|
||||
let cancel_map = Arc::new(CancelMap::default());
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
request_handler(
|
||||
req, config, conn_pool, cancel_map, session_id, sni_name,
|
||||
)
|
||||
.instrument(info_span!(
|
||||
"serverless",
|
||||
session = %session_id,
|
||||
%peer_addr,
|
||||
))
|
||||
.await
|
||||
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
|
||||
.instrument(info_span!(
|
||||
"ws-client",
|
||||
session = %session_id,
|
||||
%peer_addr,
|
||||
))
|
||||
.await
|
||||
}
|
||||
},
|
||||
)))
|
||||
@@ -162,65 +359,3 @@ where
|
||||
self.inner.call(req)
|
||||
}
|
||||
}
|
||||
|
||||
async fn request_handler(
|
||||
mut request: Request<Body>,
|
||||
config: &'static ProxyConfig,
|
||||
conn_pool: Arc<conn_pool::GlobalConnPool>,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
session_id: uuid::Uuid,
|
||||
sni_hostname: Option<String>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let host = request
|
||||
.headers()
|
||||
.get("host")
|
||||
.and_then(|h| h.to_str().ok())
|
||||
.and_then(|h| h.split(':').next())
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
info!(session_id = ?session_id, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(e) =
|
||||
websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
|
||||
.await
|
||||
{
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
|
||||
}
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
// Return the response so the spawned future can continue.
|
||||
Ok(response)
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
sql_over_http::handle(
|
||||
request,
|
||||
sni_hostname,
|
||||
conn_pool,
|
||||
session_id,
|
||||
&config.http_config,
|
||||
)
|
||||
.await
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
Response::builder()
|
||||
.header("Allow", "OPTIONS, POST")
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
|
||||
)
|
||||
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
|
||||
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
|
||||
.body(Body::empty())
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
}
|
||||
@@ -14,15 +14,14 @@ pub mod console;
|
||||
pub mod error;
|
||||
pub mod http;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
pub mod protocol2;
|
||||
pub mod proxy;
|
||||
pub mod sasl;
|
||||
pub mod scram;
|
||||
pub mod serverless;
|
||||
pub mod stream;
|
||||
pub mod url;
|
||||
pub mod usage_metrics;
|
||||
pub mod waiters;
|
||||
|
||||
/// Handle unix signals appropriately.
|
||||
|
||||
@@ -8,9 +8,9 @@ use crate::{
|
||||
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
|
||||
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
|
||||
http::StatusCode,
|
||||
metrics::{Ids, USAGE_METRICS},
|
||||
protocol2::WithClientIp,
|
||||
stream::{PqStream, Stream},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
@@ -106,26 +106,17 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
pub struct LatencyTimer {
|
||||
// time since the stopwatch was started
|
||||
start: Option<Instant>,
|
||||
// accumulated time on the stopwatch
|
||||
accumulated: std::time::Duration,
|
||||
// label data
|
||||
start: Instant,
|
||||
protocol: &'static str,
|
||||
cache_miss: bool,
|
||||
pool_miss: bool,
|
||||
outcome: &'static str,
|
||||
}
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
timer: &'a mut LatencyTimer,
|
||||
}
|
||||
|
||||
impl LatencyTimer {
|
||||
pub fn new(protocol: &'static str) -> Self {
|
||||
Self {
|
||||
start: Some(Instant::now()),
|
||||
accumulated: std::time::Duration::ZERO,
|
||||
start: Instant::now(),
|
||||
protocol,
|
||||
cache_miss: false,
|
||||
// by default we don't do pooling
|
||||
@@ -135,13 +126,6 @@ impl LatencyTimer {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pause(&mut self) -> LatencyTimerPause<'_> {
|
||||
// stop the stopwatch and record the time that we have accumulated
|
||||
let start = self.start.take().expect("latency timer should be started");
|
||||
self.accumulated += start.elapsed();
|
||||
LatencyTimerPause { timer: self }
|
||||
}
|
||||
|
||||
pub fn cache_miss(&mut self) {
|
||||
self.cache_miss = true;
|
||||
}
|
||||
@@ -155,17 +139,9 @@ impl LatencyTimer {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimerPause<'_> {
|
||||
fn drop(&mut self) {
|
||||
// start the stopwatch again
|
||||
self.timer.start = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimer {
|
||||
fn drop(&mut self) {
|
||||
let duration =
|
||||
self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
|
||||
let duration = self.start.elapsed().as_secs_f64();
|
||||
COMPUTE_CONNECTION_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
@@ -173,7 +149,7 @@ impl Drop for LatencyTimer {
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
])
|
||||
.observe(duration.as_secs_f64())
|
||||
.observe(duration)
|
||||
}
|
||||
}
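// Aside (not part of this diff): the pause/accumulate fields above form a minimal
// pausable stopwatch. A self-contained sketch of that pattern, with names chosen
// purely for illustration:
struct PausableStopwatch {
    start: Option<std::time::Instant>,
    accumulated: std::time::Duration,
}

impl PausableStopwatch {
    fn pause(&mut self) {
        // stop the clock and bank the elapsed time
        if let Some(started_at) = self.start.take() {
            self.accumulated += started_at.elapsed();
        }
    }

    fn resume(&mut self) {
        self.start = Some(std::time::Instant::now());
    }

    fn total(&self) -> std::time::Duration {
        // banked time plus the currently running interval, if any
        self.accumulated + self.start.map(|s| s.elapsed()).unwrap_or_default()
    }
}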
|
||||
|
||||
@@ -195,7 +171,7 @@ static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_io_bytes_per_client",
|
||||
"Number of bytes sent/received between client and backend.",
|
||||
@@ -204,15 +180,6 @@ static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_io_bytes",
|
||||
"Number of bytes sent/received between all clients and backends.",
|
||||
&["direction"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
listener: tokio::net::TcpListener,
|
||||
@@ -797,28 +764,24 @@ pub async fn proxy_pass(
|
||||
branch_id: aux.branch_id.to_string(),
|
||||
});
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
|
||||
let mut client = MeasuredStream::new(
|
||||
client,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes we sent to the client (outbound).
|
||||
m_sent.inc_by(cnt as u64);
|
||||
m_sent2.inc_by(cnt as u64);
|
||||
usage.record_egress(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
|
||||
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("rx"));
|
||||
let mut compute = MeasuredStream::new(
|
||||
compute,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes the client sent to the compute node (inbound).
|
||||
m_recv.inc_by(cnt as u64);
|
||||
m_recv2.inc_by(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
@@ -886,16 +849,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
application_name: params.get("application_name"),
|
||||
};
|
||||
|
||||
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
|
||||
let latency_timer = LatencyTimer::new(mode.protocol_label());
|
||||
|
||||
let auth_result = match creds
|
||||
.authenticate(
|
||||
&extra,
|
||||
&mut stream,
|
||||
mode.allow_cleartext(),
|
||||
config,
|
||||
&mut latency_timer,
|
||||
)
|
||||
.authenticate(&extra, &mut stream, mode.allow_cleartext(), config)
|
||||
.await
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
|
||||
@@ -409,7 +409,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
}
|
||||
ConnectAction::WakeRetry => {
|
||||
let err = console::errors::ApiError::Console {
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
status: http::StatusCode::INTERNAL_SERVER_ERROR,
|
||||
text: "TEST".into(),
|
||||
};
|
||||
assert!(err.could_retry());
|
||||
|
||||
@@ -18,6 +18,7 @@ mod password;
|
||||
pub use exchange::Exchange;
|
||||
pub use key::ScramKey;
|
||||
pub use secret::ServerSecret;
|
||||
pub use secret::*;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
proxy::{handle_client, ClientMode},
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream};
|
||||
use hyper::upgrade::Upgraded;
|
||||
use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
pin::Pin,
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
use tracing::warn;
|
||||
|
||||
// TODO: use `std::sync::Exclusive` once it's stabilized.
|
||||
// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
|
||||
use sync_wrapper::SyncWrapper;
|
||||
|
||||
pin_project! {
|
||||
/// This is a wrapper around a [`WebSocketStream`] that
|
||||
/// implements [`AsyncRead`] and [`AsyncWrite`].
|
||||
pub struct WebSocketRw {
|
||||
#[pin]
|
||||
stream: SyncWrapper<WebSocketStream<Upgraded>>,
|
||||
bytes: Bytes,
|
||||
}
|
||||
}
|
||||
|
||||
impl WebSocketRw {
|
||||
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
|
||||
Self {
|
||||
stream: stream.into(),
|
||||
bytes: Bytes::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncWrite for WebSocketRw {
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<io::Result<usize>> {
|
||||
let mut stream = self.project().stream.get_pin_mut();
|
||||
|
||||
ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
|
||||
match stream.as_mut().start_send(Message::Binary(buf.into())) {
|
||||
Ok(()) => Poll::Ready(Ok(buf.len())),
|
||||
Err(e) => Poll::Ready(Err(io_error(e))),
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_flush(cx).map_err(io_error)
|
||||
}
|
||||
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let stream = self.project().stream.get_pin_mut();
|
||||
stream.poll_close(cx).map_err(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncRead for WebSocketRw {
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
if buf.remaining() > 0 {
|
||||
let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
|
||||
let len = std::cmp::min(bytes.len(), buf.remaining());
|
||||
buf.put_slice(&bytes[..len]);
|
||||
self.consume(len);
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncBufRead for WebSocketRw {
|
||||
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
|
||||
// Please refer to poll_fill_buf's documentation.
|
||||
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
|
||||
|
||||
let mut this = self.project();
|
||||
loop {
|
||||
if !this.bytes.chunk().is_empty() {
|
||||
let chunk = (*this.bytes).chunk();
|
||||
return Poll::Ready(Ok(chunk));
|
||||
}
|
||||
|
||||
let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
|
||||
match res.transpose().map_err(io_error)? {
|
||||
Some(message) => match message {
|
||||
Message::Ping(_) => {}
|
||||
Message::Pong(_) => {}
|
||||
Message::Text(text) => {
|
||||
// We expect to see only binary messages.
|
||||
let error = "unexpected text message in the websocket";
|
||||
warn!(length = text.len(), error);
|
||||
return Poll::Ready(Err(io_error(error)));
|
||||
}
|
||||
Message::Frame(_) => {
|
||||
// This case is impossible according to Frame's doc.
|
||||
panic!("unexpected raw frame in the websocket");
|
||||
}
|
||||
Message::Binary(chunk) => {
|
||||
assert!(this.bytes.is_empty());
|
||||
*this.bytes = Bytes::from(chunk);
|
||||
}
|
||||
Message::Close(_) => return EOF,
|
||||
},
|
||||
None => return EOF,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn consume(self: Pin<&mut Self>, amount: usize) {
|
||||
self.project().bytes.advance(amount);
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn serve_websocket(
|
||||
websocket: HyperWebsocket,
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRw::new(websocket),
|
||||
ClientMode::Websockets { hostname },
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
|
||||
pytest-lazy-fixture = "^0.6.3"
|
||||
prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "^3.0.1"
|
||||
Werkzeug = "^2.2.3"
|
||||
pytest-order = "^1.1.0"
|
||||
allure-pytest = "^2.13.2"
|
||||
pytest-asyncio = "^0.21.0"
|
||||
|
||||
@@ -25,64 +25,57 @@ _This section is only relevant if using a command that requires access to Neon's

### Commands

#### `find-garbage`
#### `tidy`

Walk an S3 bucket and cross-reference the contents with the Console API to identify data for
tenants or timelines that should no longer exist.
Iterate over S3 buckets for storage nodes, checking their contents and removing the data not present in the console. Node S3 data that's not removed is then further checked for discrepancies and, sometimes, validated.

Unless the global `--delete` argument is provided, this command only dry-runs and logs
what it would have deleted.

```
tidy --node-kind=<safekeeper|pageserver> [--depth=<tenant|timeline>] [--skip-validation]
```

- `--node-kind`: whether to inspect safekeeper or pageserver bucket prefix
- `--depth`: whether to only search for deletable tenants, or also search for
  deletable timelines within active tenants. Default: `tenant`
- `--output-path`: filename to write garbage list to. Default `garbage.json`
- `--skip-validation`: skip additional post-deletion checks. Default: `false`

This command outputs a JSON file describing tenants and timelines to remove, for subsequent
processing by the `purge-garbage` subcommand.
For a selected S3 path, the tool lists the given S3 bucket for either tenants only or for both tenants and timelines; for every entry found, the console API is queried, and any entity that is deleted or missing in the API is scheduled for deletion from S3.

**Note that the garbage list format is not stable. The output of `find-garbage` is only
intended for use by the exact same version of the tool running `purge-garbage`**
If validation is enabled, only the non-deleted tenants' timelines are checked.
For pageserver, timelines' index_part.json on S3 is also checked for various discrepancies: no files are removed, even if there are "extra" S3 files not present in index_part.json. Due to the way the pageserver updates the remote storage, it's better to do such removals manually, stopping the corresponding tenant first.
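
The per-entry decision can be sketched as follows (illustrative only; `FetchResult` mirrors the enum added by this change, while the `decide` helper and its argument shape are made up for the example):

```
enum FetchResult<A> {
    Found(A), // the console knows the entity and it is not deleted
    Deleted,  // the console marks the entity as deleted
    Absent,   // the console has no record of the entity
}

// Deleted and Absent entries are scheduled for removal from S3;
// Found entries are kept and, unless validation is skipped, checked further.
fn decide<A>(console_entry: Option<(A, bool)>) -> FetchResult<A> {
    match console_entry {
        Some((_, true)) => FetchResult::Deleted,
        Some((entry, false)) => FetchResult::Found(entry),
        None => FetchResult::Absent,
    }
}
```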

Example:
Command examples:

`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=safekeeper`

#### `purge-garbage`
`env SSO_ACCOUNT_ID=369495373322 REGION=us-east-2 BUCKET=neon-staging-storage-us-east-2 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=pageserver --depth=timeline`

Consume a garbage list from `find-garbage`, and delete the related objects in the S3 bucket.
When dry run stats look satisfying, use `-- --delete` before the `tidy` command to
disable dry run and run the binary with deletion enabled.
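For example (illustrative only, reusing the environment from the dry-run commands above):

`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- --delete tidy --node-kind=safekeeper`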

- `--input-path`: filename to read garbage list from. Default `garbage.json`.
- `--mode`: controls whether to purge only garbage that was specifically marked
  deleted in the control plane (`deletedonly`), or also to purge tenants/timelines
  that were not present in the control plane at all (`deletedandmissing`)
See these lines (and lines around) in the logs for the final stats:

This command learns region/bucket details from the garbage file, so it is not necessary
to pass them on the command line.
- `Finished listing the bucket for tenants`
- `Finished active tenant and timeline validation`
- `Total tenant deletion stats`
- `Total timeline deletion stats`

Example:
## Current implementation details

`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
- The tool does not have any persistent state currently: instead, it creates very verbose logs, with every S3 delete request logged, every tenant/timeline id check, etc.
  Worse, any panic or early-errored task might force the tool to exit without printing the final summary; all affected ids will still be in the logs, though. The tool has retries inside it, so it's error-resistant to some extent, and recent runs showed no traces of errors/panics.

Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally
not provided inline in the example above to avoid accidents. Without the `--delete` flag
the purge command will log all the keys that it would have deleted.
- Instead of checking non-deleted tenants' timelines instantly, the tool attempts to create separate tasks (futures) for that,
  complicating the logic and slowing down the process; this should be fixed and done in one "task".

#### `scan-metadata`
- The tool uses only publicly available remote resources (S3, console) and does not access pageserver/safekeeper nodes themselves.
  Yet, its S3 setup should be prepared for running on any pageserver/safekeeper node, using the node's S3 credentials, so the node API access logic could be implemented relatively simply on top.

Walk objects in a pageserver S3 bucket, and report statistics on the contents.
## Cleanup procedure:

```
env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata

Timelines: 31106
With errors: 3
With warnings: 13942
With garbage: 0
Index versions: 2: 13942, 4: 17162
Timeline size bytes: min 22413312, 1% 52133887, 10% 56459263, 50% 101711871, 90% 191561727, 99% 280887295, max 167535558656
Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 201457663, max 275324928
Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
```

## Cleaning up running pageservers
### Pageserver preparations

If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload caused by compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.
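One way to do that (hedged: the endpoint, port, and exact semantics below are assumptions to verify against the pageserver HTTP management API docs before use) is to detach the tenant on its pageserver, e.g. `curl -X POST http://<pageserver-host>:9898/v1/tenant/<tenant_id>/detach`, so the pageserver stops holding, and re-uploading, state for it before the S3 data is purged.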
@@ -1,27 +1,178 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use tracing::{error, info, warn};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::BranchData;
|
||||
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId};
|
||||
use crate::delete_batch_producer::DeleteProducerStats;
|
||||
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES};
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub(crate) struct TimelineAnalysis {
|
||||
pub async fn validate_pageserver_active_tenant_and_timelines(
|
||||
s3_client: Arc<Client>,
|
||||
s3_root: RootTarget,
|
||||
admin_client: Arc<CloudAdminApiClient>,
|
||||
batch_producer_stats: DeleteProducerStats,
|
||||
) -> anyhow::Result<BranchCheckStats> {
|
||||
let Some(timeline_stats) = batch_producer_stats.timeline_stats else {
|
||||
info!("No tenant-only checks, exiting");
|
||||
return Ok(BranchCheckStats::default());
|
||||
};
|
||||
|
||||
let s3_active_projects = batch_producer_stats
|
||||
.tenant_stats
|
||||
.active_entries
|
||||
.into_iter()
|
||||
.map(|project| (project.id.clone(), project))
|
||||
.collect::<HashMap<_, _>>();
|
||||
info!("Validating {} active tenants", s3_active_projects.len());
|
||||
|
||||
let mut s3_active_branches_per_project = HashMap::<ProjectId, Vec<BranchData>>::new();
|
||||
let mut s3_blob_data = HashMap::<TenantTimelineId, S3TimelineBlobData>::new();
|
||||
for active_branch in timeline_stats.active_entries {
|
||||
let active_project_id = active_branch.project_id.clone();
|
||||
let active_branch_id = active_branch.id.clone();
|
||||
let active_timeline_id = active_branch.timeline_id;
|
||||
|
||||
s3_active_branches_per_project
|
||||
.entry(active_project_id.clone())
|
||||
.or_default()
|
||||
.push(active_branch);
|
||||
|
||||
let Some(active_project) = s3_active_projects.get(&active_project_id) else {
|
||||
error!(
|
||||
"Branch {:?} for project {:?} has no such project in the active projects",
|
||||
active_branch_id, active_project_id
|
||||
);
|
||||
continue;
|
||||
};
|
||||
|
||||
let id = TenantTimelineId::new(active_project.tenant, active_timeline_id);
|
||||
s3_blob_data.insert(
|
||||
id,
|
||||
list_timeline_blobs(&s3_client, id, &s3_root)
|
||||
.await
|
||||
.with_context(|| format!("List timeline {id} blobs"))?,
|
||||
);
|
||||
}
|
||||
|
||||
let mut branch_checks = JoinSet::new();
|
||||
for (_, s3_active_project) in s3_active_projects {
|
||||
let project_id = &s3_active_project.id;
|
||||
let tenant_id = s3_active_project.tenant;
|
||||
|
||||
let mut console_active_branches =
|
||||
branches_for_project_with_retries(&admin_client, project_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Client API branches for project {project_id:?} retrieval")
|
||||
})?
|
||||
.into_iter()
|
||||
.map(|branch| (branch.id.clone(), branch))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let active_branches = s3_active_branches_per_project
|
||||
.remove(project_id)
|
||||
.unwrap_or_default();
|
||||
info!(
|
||||
"Spawning tasks for {} tenant {} active timelines",
|
||||
active_branches.len(),
|
||||
tenant_id
|
||||
);
|
||||
for s3_active_branch in active_branches {
|
||||
let console_branch = console_active_branches.remove(&s3_active_branch.id);
|
||||
let timeline_id = s3_active_branch.timeline_id;
|
||||
let id = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
let s3_data = s3_blob_data.remove(&id);
|
||||
let s3_root = s3_root.clone();
|
||||
branch_checks.spawn(
|
||||
async move {
|
||||
let check_errors = branch_cleanup_and_check_errors(
|
||||
&id,
|
||||
&s3_root,
|
||||
Some(&s3_active_branch),
|
||||
console_branch,
|
||||
s3_data,
|
||||
)
|
||||
.await;
|
||||
(id, check_errors)
|
||||
}
|
||||
.instrument(info_span!("check_timeline", id = %id)),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let mut total_stats = BranchCheckStats::default();
|
||||
while let Some((id, analysis)) = branch_checks
|
||||
.join_next()
|
||||
.await
|
||||
.transpose()
|
||||
.context("branch check task join")?
|
||||
{
|
||||
total_stats.add(id, analysis.errors);
|
||||
}
|
||||
Ok(total_stats)
|
||||
}
|
||||
|
||||
async fn branches_for_project_with_retries(
|
||||
admin_client: &CloudAdminApiClient,
|
||||
project_id: &ProjectId,
|
||||
) -> anyhow::Result<Vec<BranchData>> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match admin_client.branches_for_project(project_id, false).await {
|
||||
Ok(branches) => return Ok(branches),
|
||||
Err(e) => {
|
||||
error!("admin list branches for project {project_id:?} query failed: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to list branches for project {project_id:?} {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct BranchCheckStats {
|
||||
pub timelines_with_errors: HashMap<TenantTimelineId, Vec<String>>,
|
||||
pub normal_timelines: HashSet<TenantTimelineId>,
|
||||
}
|
||||
|
||||
impl BranchCheckStats {
|
||||
pub fn add(&mut self, id: TenantTimelineId, check_errors: Vec<String>) {
|
||||
if check_errors.is_empty() {
|
||||
if !self.normal_timelines.insert(id) {
|
||||
panic!("Checking branch with timeline {id} more than once")
|
||||
}
|
||||
} else {
|
||||
match self.timelines_with_errors.entry(id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
panic!("Checking branch with timeline {id} more than once")
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(check_errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TimelineAnalysis {
|
||||
/// Anomalies detected
|
||||
pub(crate) errors: Vec<String>,
|
||||
pub errors: Vec<String>,
|
||||
|
||||
/// Healthy-but-noteworthy, like old-versioned structures that are readable but
|
||||
/// worth reporting for awareness that we must not remove that old version decoding
|
||||
/// yet.
|
||||
pub(crate) warnings: Vec<String>,
|
||||
pub warnings: Vec<String>,
|
||||
|
||||
/// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
|
||||
/// of races between reading the metadata and reading the objects.
|
||||
pub(crate) garbage_keys: Vec<String>,
|
||||
/// Keys not referenced in metadata: candidates for removal
|
||||
pub garbage_keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl TimelineAnalysis {
|
||||
@@ -34,7 +185,7 @@ impl TimelineAnalysis {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
pub async fn branch_cleanup_and_check_errors(
|
||||
id: &TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
@@ -169,13 +320,13 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct S3TimelineBlobData {
|
||||
pub(crate) blob_data: BlobDataParseResult,
|
||||
pub(crate) keys_to_remove: Vec<String>,
|
||||
pub struct S3TimelineBlobData {
|
||||
pub blob_data: BlobDataParseResult,
|
||||
pub keys_to_remove: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum BlobDataParseResult {
|
||||
pub enum BlobDataParseResult {
|
||||
Parsed {
|
||||
index_part: IndexPart,
|
||||
s3_layers: HashSet<LayerFileName>,
|
||||
@@ -183,7 +334,7 @@ pub(crate) enum BlobDataParseResult {
|
||||
Incorrect(Vec<String>),
|
||||
}
|
||||
|
||||
pub(crate) async fn list_timeline_blobs(
|
||||
pub async fn list_timeline_blobs(
|
||||
s3_client: &Client,
|
||||
id: TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
|
||||
@@ -1,19 +1,12 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use hex::FromHex;
|
||||
use reqwest::{header, Client, StatusCode, Url};
|
||||
use serde::Deserialize;
|
||||
use reqwest::{header, Client, Url};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::ConsoleConfig;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
context: String,
|
||||
@@ -41,9 +34,6 @@ impl std::fmt::Display for Error {
|
||||
self.context, e
|
||||
)
|
||||
}
|
||||
ErrorKind::ResponseStatus(status) => {
|
||||
write!(f, "Bad response status {}: {}", status, self.context)
|
||||
}
|
||||
ErrorKind::UnexpectedState => write!(f, "Unexpected state: {}", self.context),
|
||||
}
|
||||
}
|
||||
@@ -63,7 +53,6 @@ impl std::error::Error for Error {}
|
||||
pub enum ErrorKind {
|
||||
RequestSend(reqwest::Error),
|
||||
BodyRead(reqwest::Error),
|
||||
ResponseStatus(StatusCode),
|
||||
UnexpectedState,
|
||||
}
|
||||
|
||||
@@ -111,23 +100,7 @@ pub struct SafekeeperData {
|
||||
pub availability_zone_id: String,
|
||||
}
|
||||
|
||||
/// For ID fields, the Console API does not always return a value or null. It will
|
||||
/// sometimes return an empty string. Our native Id type does not consider this acceptable
|
||||
/// (nor should it), so we use a wrapper for talking to the Console API.
|
||||
fn from_nullable_id<'de, D>(deserializer: D) -> Result<TenantId, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let id_str = String::deserialize(deserializer)?;
|
||||
if id_str.is_empty() {
|
||||
// This is a bogus value, but for the purposes of the scrubber all that
|
||||
// matters is that it doesn't collide with any real IDs.
|
||||
Ok(TenantId::from([0u8; 16]))
|
||||
} else {
|
||||
TenantId::from_hex(&id_str).map_err(|e| serde::de::Error::custom(format!("{e}")))
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub struct ProjectData {
|
||||
pub id: ProjectId,
|
||||
@@ -136,7 +109,7 @@ pub struct ProjectData {
|
||||
pub platform_id: String,
|
||||
pub user_id: String,
|
||||
pub pageserver_id: u64,
|
||||
#[serde(deserialize_with = "from_nullable_id")]
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub tenant: TenantId,
|
||||
pub safekeepers: Vec<SafekeeperData>,
|
||||
pub deleted: bool,
|
||||
@@ -175,27 +148,11 @@ pub struct BranchData {
|
||||
pub written_size: Option<u64>,
|
||||
}
|
||||
|
||||
pub trait MaybeDeleted {
|
||||
fn is_deleted(&self) -> bool;
|
||||
}
|
||||
|
||||
impl MaybeDeleted for ProjectData {
|
||||
fn is_deleted(&self) -> bool {
|
||||
self.deleted
|
||||
}
|
||||
}
|
||||
|
||||
impl MaybeDeleted for BranchData {
|
||||
fn is_deleted(&self) -> bool {
|
||||
self.deleted
|
||||
}
|
||||
}
|
||||
|
||||
impl CloudAdminApiClient {
|
||||
pub fn new(config: ConsoleConfig) -> Self {
|
||||
pub fn new(token: String, base_url: Url) -> Self {
|
||||
Self {
|
||||
token: config.token,
|
||||
base_url: config.base_url,
|
||||
token,
|
||||
base_url,
|
||||
request_limiter: Semaphore::new(200),
|
||||
http_client: Client::new(), // TODO timeout configs at least
|
||||
}
|
||||
@@ -251,81 +208,6 @@ impl CloudAdminApiClient {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_projects(&self, region_id: String) -> Result<Vec<ProjectData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let mut pagination_offset = 0;
|
||||
const PAGINATION_LIMIT: usize = 512;
|
||||
let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
|
||||
loop {
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("show_deleted", "false".to_string()),
|
||||
("limit", format!("{PAGINATION_LIMIT}")),
|
||||
("offset", format!("{pagination_offset}")),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
match response.status() {
|
||||
StatusCode::OK => {}
|
||||
StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
|
||||
tokio::time::sleep(Duration::from_millis(500)).await;
|
||||
continue;
|
||||
}
|
||||
status => {
|
||||
return Err(Error::new(
|
||||
"List active projects".to_string(),
|
||||
ErrorKind::ResponseStatus(response.status()),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let response_bytes = response.bytes().await.map_err(|e| {
|
||||
Error::new("List active projects".to_string(), ErrorKind::BodyRead(e))
|
||||
})?;
|
||||
|
||||
let decode_result =
|
||||
serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
|
||||
|
||||
let mut response = match decode_result {
|
||||
Ok(r) => r,
|
||||
Err(decode) => {
|
||||
tracing::error!(
|
||||
"Failed to decode response body: {}\n{}",
|
||||
decode,
|
||||
String::from_utf8(response_bytes.to_vec()).unwrap()
|
||||
);
|
||||
panic!("we out");
|
||||
}
|
||||
};
|
||||
|
||||
pagination_offset += response.data.len();
|
||||
|
||||
result.extend(response.data.drain(..).filter(|t| t.region_id == region_id));
|
||||
|
||||
if pagination_offset >= response.total.unwrap_or(0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub async fn find_timeline_branch(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
354
s3_scrubber/src/delete_batch_producer.rs
Normal file
@@ -0,0 +1,354 @@
|
||||
mod tenant_batch;
|
||||
mod timeline_batch;
|
||||
|
||||
use std::future::Future;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::UnboundedReceiver;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task::{JoinHandle, JoinSet};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
|
||||
use crate::{list_objects_with_retries, RootTarget, S3Target, TraversingDepth, MAX_RETRIES};
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
/// Typical tenant to remove contains 1 layer and 1 index_part.json blobs
|
||||
/// Also, there are some non-standard tenants to remove, having more layers.
|
||||
/// delete_objects request allows up to 1000 keys, so be on a safe side and allow most
|
||||
/// batch processing tasks to do 1 delete objects request only.
|
||||
///
|
||||
/// Every batch item will be additionally S3 LS'ed later, so keep the batch size
|
||||
/// even lower to allow multiple concurrent tasks do the LS requests.
|
||||
const BATCH_SIZE: usize = 100;
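// Rough sizing behind the constant above (an aside, not part of the original file):
// a typical entry contributes about two keys (one layer blob plus index_part.json), so a
// full batch of 100 entries is on the order of 200 keys, comfortably below the 1000-key
// limit of a single delete_objects request, while leaving headroom for the extra
// per-entry S3 listing done later.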
|
||||
|
||||
pub struct DeleteBatchProducer {
|
||||
delete_tenants_sender_task: JoinHandle<anyhow::Result<ProcessedS3List<TenantId, ProjectData>>>,
|
||||
delete_timelines_sender_task:
|
||||
JoinHandle<anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>>>,
|
||||
delete_batch_creator_task: JoinHandle<()>,
|
||||
delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
|
||||
}
|
||||
|
||||
pub struct DeleteProducerStats {
|
||||
pub tenant_stats: ProcessedS3List<TenantId, ProjectData>,
|
||||
pub timeline_stats: Option<ProcessedS3List<TenantTimelineId, BranchData>>,
|
||||
}
|
||||
|
||||
impl DeleteProducerStats {
|
||||
pub fn tenants_checked(&self) -> usize {
|
||||
self.tenant_stats.entries_total
|
||||
}
|
||||
|
||||
pub fn active_tenants(&self) -> usize {
|
||||
self.tenant_stats.active_entries.len()
|
||||
}
|
||||
|
||||
pub fn timelines_checked(&self) -> usize {
|
||||
self.timeline_stats
|
||||
.as_ref()
|
||||
.map(|stats| stats.entries_total)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct DeleteBatch {
|
||||
pub tenants: Vec<TenantId>,
|
||||
pub timelines: Vec<TenantTimelineId>,
|
||||
}
|
||||
|
||||
impl DeleteBatch {
|
||||
pub fn merge(&mut self, other: Self) {
|
||||
self.tenants.extend(other.tenants);
|
||||
self.timelines.extend(other.timelines);
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.tenants.len() + self.timelines.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
impl DeleteBatchProducer {
|
||||
pub fn start(
|
||||
admin_client: Arc<CloudAdminApiClient>,
|
||||
s3_client: Arc<Client>,
|
||||
s3_root_target: RootTarget,
|
||||
traversing_depth: TraversingDepth,
|
||||
) -> Self {
|
||||
let (delete_elements_sender, mut delete_elements_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_elements_sender = Arc::new(delete_elements_sender);
|
||||
let admin_client = Arc::new(admin_client);
|
||||
|
||||
let (projects_to_check_sender, mut projects_to_check_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_tenants_root_target = s3_root_target.clone();
|
||||
let delete_tenants_client = Arc::clone(&s3_client);
|
||||
let delete_tenants_admin_client = Arc::clone(&admin_client);
|
||||
let delete_sender = Arc::clone(&delete_elements_sender);
|
||||
let delete_tenants_sender_task = tokio::spawn(
|
||||
async move {
|
||||
tenant_batch::schedule_cleanup_deleted_tenants(
|
||||
&delete_tenants_root_target,
|
||||
&delete_tenants_client,
|
||||
&delete_tenants_admin_client,
|
||||
projects_to_check_sender,
|
||||
delete_sender,
|
||||
traversing_depth,
|
||||
)
|
||||
.await
|
||||
}
|
||||
.instrument(info_span!("delete_tenants_sender")),
|
||||
);
|
||||
let delete_timelines_sender_task = tokio::spawn(async move {
|
||||
timeline_batch::schedule_cleanup_deleted_timelines(
|
||||
&s3_root_target,
|
||||
&s3_client,
|
||||
&admin_client,
|
||||
&mut projects_to_check_receiver,
|
||||
delete_elements_sender,
|
||||
)
|
||||
.in_current_span()
|
||||
.await
|
||||
});
|
||||
|
||||
let (delete_batch_sender, delete_batch_receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_batch_creator_task = tokio::spawn(
|
||||
async move {
|
||||
'outer: loop {
|
||||
let mut delete_batch = DeleteBatch::default();
|
||||
while delete_batch.len() < BATCH_SIZE {
|
||||
match delete_elements_receiver.recv().await {
|
||||
Some(new_task) => match new_task {
|
||||
Either::Left(tenant_id) => delete_batch.tenants.push(tenant_id),
|
||||
Either::Right(timeline_id) => {
|
||||
delete_batch.timelines.push(timeline_id)
|
||||
}
|
||||
},
|
||||
None => {
|
||||
info!("Task finished: sender dropped");
|
||||
delete_batch_sender.send(delete_batch).ok();
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !delete_batch.is_empty() {
|
||||
delete_batch_sender.send(delete_batch).ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("delete batch creator")),
|
||||
);
|
||||
|
||||
Self {
|
||||
delete_tenants_sender_task,
|
||||
delete_timelines_sender_task,
|
||||
delete_batch_creator_task,
|
||||
delete_batch_receiver: Arc::new(Mutex::new(delete_batch_receiver)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn subscribe(&self) -> Arc<Mutex<UnboundedReceiver<DeleteBatch>>> {
|
||||
self.delete_batch_receiver.clone()
|
||||
}
|
||||
|
||||
pub async fn join(self) -> anyhow::Result<DeleteProducerStats> {
|
||||
let (delete_tenants_task_result, delete_timelines_task_result, batch_task_result) = tokio::join!(
|
||||
self.delete_tenants_sender_task,
|
||||
self.delete_timelines_sender_task,
|
||||
self.delete_batch_creator_task,
|
||||
);
|
||||
|
||||
let tenant_stats = match delete_tenants_task_result {
|
||||
Ok(Ok(stats)) => stats,
|
||||
Ok(Err(tenant_deletion_error)) => return Err(tenant_deletion_error),
|
||||
Err(join_error) => {
|
||||
anyhow::bail!("Failed to join the delete tenant producing task: {join_error}")
|
||||
}
|
||||
};
|
||||
|
||||
let timeline_stats = match delete_timelines_task_result {
|
||||
Ok(Ok(stats)) => Some(stats),
|
||||
Ok(Err(timeline_deletion_error)) => return Err(timeline_deletion_error),
|
||||
Err(join_error) => {
|
||||
anyhow::bail!("Failed to join the delete timeline producing task: {join_error}")
|
||||
}
|
||||
};
|
||||
|
||||
match batch_task_result {
|
||||
Ok(()) => (),
|
||||
Err(join_error) => anyhow::bail!("Failed to join the batch forming task: {join_error}"),
|
||||
};
|
||||
|
||||
Ok(DeleteProducerStats {
|
||||
tenant_stats,
|
||||
timeline_stats,
|
||||
})
|
||||
}
|
||||
}
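// Aside (not part of this file): a sketch of how a caller can drive the producer defined
// above. Construction of the clients and the root target is elided, and the function
// name is illustrative only.
async fn run_producer_sketch(
    admin_client: Arc<CloudAdminApiClient>,
    s3_client: Arc<Client>,
    s3_root_target: RootTarget,
) -> anyhow::Result<()> {
    let producer = DeleteBatchProducer::start(
        admin_client,
        s3_client,
        s3_root_target,
        TraversingDepth::Timeline,
    );

    // Consumers lock the shared receiver and drain `DeleteBatch`es as they are formed,
    // issuing the actual S3 delete requests (omitted here).
    let _batch_receiver = producer.subscribe();

    // `join` waits for the listing/checking tasks and returns the aggregate stats.
    let stats = producer.join().await?;
    info!(
        "Checked {} tenants and {} timelines",
        stats.tenants_checked(),
        stats.timelines_checked()
    );
    Ok(())
}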
|
||||
|
||||
pub struct ProcessedS3List<I, A> {
|
||||
pub entries_total: usize,
|
||||
pub entries_to_delete: Vec<I>,
|
||||
pub active_entries: Vec<A>,
|
||||
}
|
||||
|
||||
impl<I, A> Default for ProcessedS3List<I, A> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entries_total: 0,
|
||||
entries_to_delete: Vec::new(),
|
||||
active_entries: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I, A> ProcessedS3List<I, A> {
|
||||
fn merge(&mut self, other: Self) {
|
||||
self.entries_total += other.entries_total;
|
||||
self.entries_to_delete.extend(other.entries_to_delete);
|
||||
self.active_entries.extend(other.active_entries);
|
||||
}
|
||||
|
||||
fn change_ids<NewI>(self, transform: impl Fn(I) -> NewI) -> ProcessedS3List<NewI, A> {
|
||||
ProcessedS3List {
|
||||
entries_total: self.entries_total,
|
||||
entries_to_delete: self.entries_to_delete.into_iter().map(transform).collect(),
|
||||
active_entries: self.active_entries,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn process_s3_target_recursively<F, Fut, I, E, A>(
|
||||
s3_client: &Client,
|
||||
target: &S3Target,
|
||||
find_active_and_deleted_entries: F,
|
||||
) -> anyhow::Result<ProcessedS3List<I, A>>
|
||||
where
|
||||
I: FromStr<Err = E> + Send + Sync,
|
||||
E: Send + Sync + std::error::Error + 'static,
|
||||
F: FnOnce(Vec<I>) -> Fut + Clone,
|
||||
Fut: Future<Output = anyhow::Result<ProcessedS3List<I, A>>>,
|
||||
{
|
||||
let mut continuation_token = None;
|
||||
let mut total_entries = ProcessedS3List::default();
|
||||
|
||||
loop {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, target, continuation_token.clone()).await?;
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.common_prefixes()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|prefix| prefix.prefix())
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.strip_prefix(&target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
})
|
||||
.map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<I>>>()
|
||||
.context("list and parse bucket's entry ids")?;
|
||||
|
||||
total_entries.merge(
|
||||
(find_active_and_deleted_entries.clone())(new_entry_ids)
|
||||
.await
|
||||
.context("filter active and deleted entry ids")?,
|
||||
);
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(total_entries)
|
||||
}
|
||||
|
||||
enum FetchResult<A> {
|
||||
Found(A),
|
||||
Deleted,
|
||||
Absent,
|
||||
}
|
||||
|
||||
async fn split_to_active_and_deleted_entries<I, A, F, Fut>(
|
||||
new_entry_ids: Vec<I>,
|
||||
find_active_entry: F,
|
||||
) -> anyhow::Result<ProcessedS3List<I, A>>
|
||||
where
|
||||
I: std::fmt::Display + Send + Sync + 'static + Copy,
|
||||
A: Send + 'static,
|
||||
F: FnOnce(I) -> Fut + Send + Sync + 'static + Clone,
|
||||
Fut: Future<Output = anyhow::Result<FetchResult<A>>> + Send,
|
||||
{
|
||||
let entries_total = new_entry_ids.len();
|
||||
let mut check_tasks = JoinSet::new();
|
||||
let mut active_entries = Vec::with_capacity(entries_total);
|
||||
let mut entries_to_delete = Vec::with_capacity(entries_total);
|
||||
|
||||
for new_entry_id in new_entry_ids {
|
||||
let check_closure = find_active_entry.clone();
|
||||
check_tasks.spawn(
|
||||
async move {
|
||||
(
|
||||
new_entry_id,
|
||||
async {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
let closure_clone = check_closure.clone();
|
||||
match closure_clone(new_entry_id).await {
|
||||
Ok(active_entry) => return Ok(active_entry),
|
||||
Err(e) => {
|
||||
error!("find active entry admin API call failed: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to check entry {new_entry_id} {MAX_RETRIES} times")
|
||||
}
|
||||
.await,
|
||||
)
|
||||
}
|
||||
.instrument(info_span!("filter_active_entries")),
|
||||
);
|
||||
}
|
||||
|
||||
while let Some(task_result) = check_tasks.join_next().await {
|
||||
let (entry_id, entry_data_fetch_result) = task_result.context("task join")?;
|
||||
match entry_data_fetch_result.context("entry data fetch")? {
|
||||
FetchResult::Found(active_entry) => {
|
||||
info!("Entry {entry_id} is alive, cannot delete");
|
||||
active_entries.push(active_entry);
|
||||
}
|
||||
FetchResult::Deleted => {
|
||||
info!("Entry {entry_id} deleted in the admin data, can safely delete");
|
||||
entries_to_delete.push(entry_id);
|
||||
}
|
||||
FetchResult::Absent => {
|
||||
info!("Entry {entry_id} absent in the admin data, can safely delete");
|
||||
entries_to_delete.push(entry_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ProcessedS3List {
|
||||
entries_total,
|
||||
entries_to_delete,
|
||||
active_entries,
|
||||
})
|
||||
}
|
||||
87
s3_scrubber/src/delete_batch_producer/tenant_batch.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cloud_admin_api::{CloudAdminApiClient, ProjectData};
|
||||
use crate::delete_batch_producer::FetchResult;
|
||||
use crate::{RootTarget, TraversingDepth};
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
use super::ProcessedS3List;
|
||||
|
||||
pub async fn schedule_cleanup_deleted_tenants(
|
||||
s3_root_target: &RootTarget,
|
||||
s3_client: &Arc<Client>,
|
||||
admin_client: &Arc<CloudAdminApiClient>,
|
||||
projects_to_check_sender: UnboundedSender<ProjectData>,
|
||||
delete_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
|
||||
traversing_depth: TraversingDepth,
|
||||
) -> anyhow::Result<ProcessedS3List<TenantId, ProjectData>> {
|
||||
info!(
|
||||
"Starting to list the bucket from root {}",
|
||||
s3_root_target.bucket_name()
|
||||
);
|
||||
s3_client
|
||||
.head_bucket()
|
||||
.bucket(s3_root_target.bucket_name())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
|
||||
|
||||
let check_client = Arc::clone(admin_client);
|
||||
let tenant_stats = super::process_s3_target_recursively(
|
||||
s3_client,
|
||||
s3_root_target.tenants_root(),
|
||||
|s3_tenants| async move {
|
||||
let another_client = Arc::clone(&check_client);
|
||||
super::split_to_active_and_deleted_entries(s3_tenants, move |tenant_id| async move {
|
||||
let project_data = another_client
|
||||
.find_tenant_project(tenant_id)
|
||||
.await
|
||||
.with_context(|| format!("Tenant {tenant_id} project admin check"))?;
|
||||
|
||||
Ok(if let Some(console_project) = project_data {
|
||||
if console_project.deleted {
|
||||
delete_sender.send(Either::Left(tenant_id)).ok();
|
||||
FetchResult::Deleted
|
||||
} else {
|
||||
if traversing_depth == TraversingDepth::Timeline {
|
||||
projects_to_check_sender.send(console_project.clone()).ok();
|
||||
}
|
||||
FetchResult::Found(console_project)
|
||||
}
|
||||
} else {
|
||||
delete_sender.send(Either::Left(tenant_id)).ok();
|
||||
FetchResult::Absent
|
||||
})
|
||||
})
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("tenant batch processing")?;
|
||||
|
||||
info!(
|
||||
"Among {} tenants, found {} tenants to delete and {} active ones",
|
||||
tenant_stats.entries_total,
|
||||
tenant_stats.entries_to_delete.len(),
|
||||
tenant_stats.active_entries.len(),
|
||||
);
|
||||
|
||||
let tenant_stats = match traversing_depth {
|
||||
TraversingDepth::Tenant => {
|
||||
info!("Finished listing the bucket for tenants only");
|
||||
tenant_stats
|
||||
}
|
||||
TraversingDepth::Timeline => {
|
||||
info!("Finished listing the bucket for tenants and sent {} active tenants to check for timelines", tenant_stats.active_entries.len());
|
||||
tenant_stats
|
||||
}
|
||||
};
|
||||
|
||||
Ok(tenant_stats)
|
||||
}
|
||||
102
s3_scrubber/src/delete_batch_producer/timeline_batch.rs
Normal file
@@ -0,0 +1,102 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
|
||||
use crate::delete_batch_producer::{FetchResult, ProcessedS3List};
|
||||
use crate::RootTarget;
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
pub async fn schedule_cleanup_deleted_timelines(
|
||||
s3_root_target: &RootTarget,
|
||||
s3_client: &Arc<Client>,
|
||||
admin_client: &Arc<CloudAdminApiClient>,
|
||||
projects_to_check_receiver: &mut UnboundedReceiver<ProjectData>,
|
||||
delete_elements_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
|
||||
) -> anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>> {
|
||||
info!(
|
||||
"Starting to list the bucket from root {}",
|
||||
s3_root_target.bucket_name()
|
||||
);
|
||||
s3_client
|
||||
.head_bucket()
|
||||
.bucket(s3_root_target.bucket_name())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
|
||||
|
||||
let mut timeline_stats = ProcessedS3List::default();
|
||||
while let Some(project_to_check) = projects_to_check_receiver.recv().await {
|
||||
let check_client = Arc::clone(admin_client);
|
||||
|
||||
let check_s3_client = Arc::clone(s3_client);
|
||||
|
||||
let check_delete_sender = Arc::clone(&delete_elements_sender);
|
||||
|
||||
let check_root = s3_root_target.clone();
|
||||
|
||||
let new_stats = async move {
|
||||
let tenant_id_to_check = project_to_check.tenant;
|
||||
let check_target = check_root.timelines_root(&tenant_id_to_check);
|
||||
let stats = super::process_s3_target_recursively(
|
||||
&check_s3_client,
|
||||
&check_target,
|
||||
|s3_timelines| async move {
|
||||
let another_client = check_client.clone();
|
||||
super::split_to_active_and_deleted_entries(
|
||||
s3_timelines,
|
||||
move |timeline_id| async move {
|
||||
let console_branch = another_client
|
||||
.find_timeline_branch(timeline_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Timeline {timeline_id} branch admin check: {e}"
|
||||
)
|
||||
})?;
|
||||
|
||||
let id = TenantTimelineId::new(tenant_id_to_check, timeline_id);
|
||||
Ok(match console_branch {
|
||||
Some(console_branch) => {
|
||||
if console_branch.deleted {
|
||||
check_delete_sender.send(Either::Right(id)).ok();
|
||||
FetchResult::Deleted
|
||||
} else {
|
||||
FetchResult::Found(console_branch)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
check_delete_sender.send(Either::Right(id)).ok();
|
||||
FetchResult::Absent
|
||||
}
|
||||
})
|
||||
},
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("tenant {tenant_id_to_check} timeline batch processing"))?
|
||||
.change_ids(|timeline_id| TenantTimelineId::new(tenant_id_to_check, timeline_id));
|
||||
|
||||
Ok::<_, anyhow::Error>(stats)
|
||||
}
|
||||
.instrument(info_span!("delete_timelines_sender", tenant = %project_to_check.tenant))
|
||||
.await?;
|
||||
|
||||
timeline_stats.merge(new_stats);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Among {} timelines, found {} timelines to delete and {} active ones",
|
||||
timeline_stats.entries_total,
|
||||
timeline_stats.entries_to_delete.len(),
|
||||
timeline_stats.active_entries.len(),
|
||||
);
|
||||
|
||||
Ok(timeline_stats)
|
||||
}
|
||||