Add script to create safekeeper timelines

pageserver: helpers for explicitly dying on fatal I/O errors (#5651)
Following from discussion on https://github.com/neondatabase/neon/pull/5436 where hacking an implicit die-on-fatal-io behavior into an Error type was a source of disagreement -- in this PR, dying on fatal I/O errors is explicit, with `fatal_err` and `maybe_fatal_err` helpers in the `MaybeFatalIo` trait, which is implemented for std::io::Result. To enable this approach with `crashsafe_overwrite`, the return type of that function is changed to std::io::Result -- the previous error enum for this function was not used for any logic, and the utility of saying exactly which step in the function failed is outweighed by the hygiene of having an I/O function return an io::Result. The initial use case for these helpers is the deletion queue.
2026-05-19 06:00:38 +00:00 · 2023-11-03 13:10:06 +00:00 · 2023-11-02 09:14:26 +00:00 · 2023-11-02 08:06:32 +00:00 · 2023-11-01 17:02:58 -04:00 · 2023-11-01 17:38:32 +02:00
116 changed files with 5377 additions and 5208 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -847,7 +847,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.2
+      VM_BUILDER_VERSION: v0.18.5

    steps:
      - name: Checkout
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 2'
+    - cron: '0 7 * * 5'
  workflow_dispatch:

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1609,16 +1609,6 @@ dependencies = [
 "subtle",
 ]

-[[package]]
-name = "ctor"
-version = "0.1.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
-dependencies = [
- "quote",
- "syn 1.0.109",
-]
-
 [[package]]
 name = "ctr"
 version = "0.6.0"
@@ -2714,11 +2704,10 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.17"
+version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
+checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 dependencies = [
- "cfg-if",
 "value-bag",
 ]

@@ -6011,13 +6000,9 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

 [[package]]
 name = "value-bag"
-version = "1.0.0-alpha.9"
+version = "1.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
-dependencies = [
- "ctor",
- "version_check",
-]
+checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe"

 [[package]]
 name = "vcpkg"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -156,6 +156,7 @@ fn main() -> Result<()> {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
+                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
@@ -277,8 +278,9 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let vm_monitor_addr = matches
+                .get_one::<String>("vm-monitor-addr")
+                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
@@ -287,22 +289,16 @@ fn main() -> Result<()> {
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
+            let rt = if env::var_os("AUTOSCALING").is_some() {
+                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
+                        .expect("failed to create tokio runtime for monitor")
+                )
+            } else {
+                None
            };

            // This token is used internally by the monitor to clean up all threads
@@ -313,7 +309,7 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
+                        addr: vm_monitor_addr.clone(),
                        file_cache_on_disk,
                    })),
                    token.clone(),
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
+        .query(
+            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
+            &[],
+        )?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
+            replication: Some(row.get("rolreplication")),
+            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", jwt)
+        .header("Authorization", format!("Bearer {}", jwt))
        .send()
        .map_err(|e| {
            (
@@ -265,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
+                || !r.bypassrls.unwrap_or(false)
+                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -296,7 +298,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
+                let mut query: String =
+                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -19,7 +19,7 @@ const COMMAND: &str = "attachment_service";
 pub struct AttachHookRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    pub pageserver_id: Option<NodeId>,
+    pub node_id: Option<NodeId>,
 }

 #[derive(Serialize, Deserialize)]
@@ -85,7 +85,7 @@ impl AttachmentService {
            .control_plane_api
            .clone()
            .unwrap()
-            .join("attach_hook")
+            .join("attach-hook")
            .unwrap();
        let client = reqwest::blocking::ClientBuilder::new()
            .build()
@@ -93,7 +93,7 @@ impl AttachmentService {

        let request = AttachHookRequest {
            tenant_id,
-            pageserver_id: Some(pageserver_id),
+            node_id: Some(pageserver_id),
        };

        let response = client.post(url).json(&request).send()?;
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -12,6 +12,7 @@ use hyper::{Body, Request, Response};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
+use utils::http::endpoint::request_span;
 use utils::logging::{self, LogFormat};
 use utils::signals::{ShutdownSignals, Signal};

@@ -171,7 +172,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                id: *t,
-                generation: state.generation,
+                gen: state.generation,
            });
        }
    }
@@ -217,14 +218,31 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
-            pageserver: attach_req.pageserver_id,
+            pageserver: attach_req.node_id,
            generation: 0,
        });

-    if attach_req.pageserver_id.is_some() {
+    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
        tenant_state.generation += 1;
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            ps_id = %attaching_pageserver,
+            generation = %tenant_state.generation,
+            "issuing",
+        );
+    } else if let Some(ps_id) = tenant_state.pageserver {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            %ps_id,
+            generation = %tenant_state.generation,
+            "dropping",
+        );
+    } else {
+        tracing::info!(
+            tenant_id = %attach_req.tenant_id,
+            "no-op: tenant already has no pageserver");
    }
-    tenant_state.pageserver = attach_req.pageserver_id;
+    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -232,7 +250,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    json_response(
        StatusCode::OK,
        AttachHookResponse {
-            gen: attach_req.pageserver_id.map(|_| generation),
+            gen: attach_req.node_id.map(|_| generation),
        },
    )
 }
@@ -240,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", handle_re_attach)
-        .post("/validate", handle_validate)
-        .post("/attach_hook", handle_attach_hook)
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
+        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
 }

 #[tokio::main]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -798,6 +798,24 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
+        "reconfigure" => {
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
+                } else {
+                    None
+                };
+            endpoint.reconfigure(pageserver_id)?;
+        }
        "stop" => {
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
@@ -1369,6 +1387,12 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
+                .subcommand(Command::new("reconfigure")
+                            .about("Reconfigure the endpoint")
+                            .arg(endpoint_pageserver_id_arg)
+                            .arg(endpoint_id_arg.clone())
+                            .arg(tenant_id_arg.clone())
+                )
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -414,18 +414,34 @@ impl Endpoint {
            );
        }

-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
+        Ok(())
+    }
+
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-
        Ok(())
    }

+    fn read_postgresql_conf(&self) -> Result<String> {
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => Ok(String::from_utf8(content)?),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
+            Err(e) => Err(anyhow::Error::new(e).context(format!(
+                "failed to read config file in {}",
+                postgresql_conf_path.to_str().unwrap()
+            ))),
+        }
+    }
+
    pub fn start(
        &self,
        auth_token: &Option<String>,
@@ -436,21 +452,7 @@ impl Endpoint {
            anyhow::bail!("The endpoint is already running");
        }

-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => String::from_utf8(content)?,
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-            Err(e) => {
-                return Err(anyhow::Error::new(e).context(format!(
-                    "failed to read config file in {}",
-                    postgresql_conf_path.to_str().unwrap()
-                )))
-            }
-        };
+        let postgresql_conf = self.read_postgresql_conf()?;

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
@@ -621,6 +623,61 @@ impl Endpoint {
        }
    }

+    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+        let mut spec: ComputeSpec = {
+            let spec_path = self.endpoint_path().join("spec.json");
+            let file = std::fs::File::open(spec_path)?;
+            serde_json::from_reader(file)?
+        };
+
+        let postgresql_conf = self.read_postgresql_conf()?;
+        spec.cluster.postgresql_conf = Some(postgresql_conf);
+
+        if let Some(pageserver_id) = pageserver_id {
+            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
+            let mut endpoint_conf: EndpointConf = {
+                let file = std::fs::File::open(&endpoint_config_path)?;
+                serde_json::from_reader(file)?
+            };
+            endpoint_conf.pageserver_id = pageserver_id;
+            std::fs::write(
+                endpoint_config_path,
+                serde_json::to_string_pretty(&endpoint_conf)?,
+            )?;
+
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        }
+
+        let client = reqwest::blocking::Client::new();
+        let response = client
+            .post(format!(
+                "http://{}:{}/configure",
+                self.http_address.ip(),
+                self.http_address.port()
+            ))
+            .body(format!(
+                "{{\"spec\":{}}}",
+                serde_json::to_string_pretty(&spec)?
+            ))
+            .send()?;
+
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(())
+        } else {
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
+    }
+
    pub fn stop(&self, destroy: bool) -> Result<()> {
        // If we are going to destroy data directory,
        // use immediate shutdown mode, otherwise,
@@ -629,15 +686,25 @@ impl Endpoint {
        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;
+
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
+        //
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
-            self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
-        } else {
-            self.pg_ctl(&["stop"], &None)?;
        }
        Ok(())
    }
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -0,0 +1,108 @@
+# Updating Postgres
+
+## Minor Versions
+
+When upgrading to a new minor version of Postgres, please follow these steps:
+
+_Example: 15.4 is the new minor version to upgrade to from 15.3._
+
+1. Clone the Neon Postgres repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/postgres.git
+    ```
+
+1. Add the Postgres upstream remote.
+
+    ```shell
+    git remote add upstream https://git.postgresql.org/git/postgresql.git
+    ```
+
+1. Create a new branch based on the stable branch you are updating.
+
+    ```shell
+    git checkout -b my-branch REL_15_STABLE_neon
+    ```
+
+1. Tag the last commit on the stable branch you are updating.
+
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.
+
+    ```shell
+    git fetch upstream REL_15_4
+    git rebase REL_15_4
+    ```
+
+1. Run the Postgres test suite to make sure our commits have not affected
+Postgres in a negative way.
+
+    ```shell
+    make check
+    # OR
+    meson test -C builddir
+    ```
+
+1. Push your branch to the Neon Postgres repository.
+
+    ```shell
+    git push origin my-branch
+    ```
+
+1. Clone the Neon repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/neon.git
+    ```
+
+1. Create a new branch.
+
+1. Change the `revisions.json` file to point at the HEAD of your Postgres
+branch.
+
+1. Update the Git submodule.
+
+    ```shell
+    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule update --remote vendor/postgres-v15
+    ```
+
+1. Run the Neon test suite to make sure that Neon is still good to go on this
+minor Postgres release.
+
+    ```shell
+    ./scripts/poetry -k pg15
+    ```
+
+1. Commit your changes.
+
+1. Create a pull request, and wait for CI to go green.
+
+1. Force push the rebased Postgres branches into the Neon Postgres repository.
+
+    ```shell
+    git push --force origin my-branch:REL_15_STABLE_neon
+    ```
+
+    It may require disabling various branch protections.
+
+1. Update your Neon PR to point the submodule back at the stable branch.
+
+    ```shell
+    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
+    git commit --amend --no-edit
+    git push --force origin
+    ```
+
+1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -190,6 +190,8 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
+    pub replication: Option<bool>,
+    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -89,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub fn set_build_info_metric(revision: &str) {
+pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build/version information",
-        &["revision"]
+        &["revision", "build_tag"]
    )
    .expect("Failed to register build info metric");
-    metric.with_label_values(&[revision]).set(1);
+    metric.with_label_values(&[revision, build_tag]).set(1);
 }

 // Records I/O stats in a "cross-platform" way.
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -17,7 +17,7 @@ pub struct ReAttachRequest {
 pub struct ReAttachResponseTenant {
    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
-    pub generation: u32,
+    pub gen: u32,
 }

 #[derive(Serialize, Deserialize)]
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,7 +110,6 @@ impl TenantState {
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
-            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -242,6 +242,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        }
    }

+    /// Cancellation safe as long as the underlying IO is cancellation safe.
    async fn shutdown(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -393,13 +394,23 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
-        F: Fn() -> S,
+        F: Fn() -> S + Clone,
        S: Future,
    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        // socket might be already closed, e.g. if previously received error,
-        // so ignore result.
-        self.framed.shutdown().await.ok();
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;
+
+        tokio::select! {
+            _ = shutdown_watcher() => {
+                // do nothing; we were most likely already stopped by shutdown and will log it next.
+            }
+            _ = self.framed.shutdown() => {
+                // socket might be already closed, e.g. if previously received error,
+                // so ignore result.
+            },
+        }
+
        match ret {
            Ok(()) => Ok(()),
            Err(QueryError::Shutdown) => {
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -14,6 +14,7 @@ macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
+            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -214,27 +214,24 @@ where
    }
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn flush<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
 ) -> Result<(), io::Error> {
    while write_buf.has_remaining() {
-        let bytes_written = stream.write(write_buf.chunk()).await?;
+        let bytes_written = stream.write_buf(write_buf).await?;
        if bytes_written == 0 {
            return Err(io::Error::new(
                ErrorKind::WriteZero,
                "failed to write message",
            ));
        }
-        // The advanced part will be garbage collected, likely during shifting
-        // data left on next attempt to write to buffer when free space is not
-        // enough.
-        write_buf.advance(bytes_written);
    }
-    write_buf.clear();
    stream.flush().await
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn shutdown<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,8 +23,8 @@ use tracing::debug;

 use crate::s3_bucket::RequestKind;
 use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
-    StorageMetadata,
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
+    RemoteStorage, StorageMetadata,
 };

 pub struct AzureBlobStorage {
@@ -184,10 +184,11 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {

 #[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
-    async fn list_prefixes(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_name(p))
@@ -195,16 +196,19 @@ impl RemoteStorage for AzureBlobStorage {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut builder = self
-            .client
-            .list_blobs()
-            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+        let mut builder = self.client.list_blobs();
+
+        if let ListingMode::WithDelimiter = mode {
+            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+        }

        if let Some(prefix) = list_prefix {
            builder = builder.prefix(Cow::from(prefix.to_owned()));
@@ -215,46 +219,23 @@ impl RemoteStorage for AzureBlobStorage {
        }

        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(entry) = response.next().await {
-            let entry = entry.map_err(to_download_error)?;
-            let name_iter = entry
+        let mut res = Listing::default();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(to_download_error)?;
+            let prefix_iter = entry
                .blobs
                .prefixes()
                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.extend(name_iter);
-        }
-        Ok(res)
-    }
+            res.prefixes.extend(prefix_iter);

-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone());
-
-        let mut builder = self.client.list_blobs();
-
-        if let Some(folder_name) = folder_name {
-            builder = builder.prefix(Cow::from(folder_name.to_owned()));
-        }
-
-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
-
-        let mut response = builder.into_stream();
-        let mut res = Vec::new();
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(anyhow::Error::new)?;
-            let name_iter = entry
+            let blob_iter = entry
                .blobs
                .blobs()
-                .map(|bl| self.name_to_relative_path(&bl.name));
-            res.extend(name_iter);
+                .map(|k| self.name_to_relative_path(&k.name));
+            res.keys.extend(blob_iter);
        }
        Ok(res)
    }
-
    async fn upload(
        &self,
        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -129,6 +129,22 @@ impl RemotePath {
    }
 }

+/// We don't need callers to be able to pass arbitrary delimiters: just control
+/// whether listings will use a '/' separator or not.
+///
+/// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
+/// NoDelimiter mode will only populate `keys`.
+pub enum ListingMode {
+    WithDelimiter,
+    NoDelimiter,
+}
+
+#[derive(Default)]
+pub struct Listing {
+    pub prefixes: Vec<RemotePath>,
+    pub keys: Vec<RemotePath>,
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -141,8 +157,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError>;
-
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::WithDelimiter)
+            .await?
+            .prefixes;
+        Ok(result)
+    }
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
@@ -154,7 +175,16 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
+        Ok(result)
+    }
+
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        _mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -205,6 +235,9 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
+    /// A cancellation token aborted the download, typically during
+    /// tenant detach or process shutdown.
+    Cancelled,
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -215,6 +248,7 @@ impl std::fmt::Display for DownloadError {
            DownloadError::BadInput(e) => {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
@@ -234,6 +268,19 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    pub async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list(prefix, mode).await,
+            Self::AwsS3(s) => s.list(prefix, mode).await,
+            Self::AzureBlob(s) => s.list(prefix, mode).await,
+            Self::Unreliable(s) => s.list(prefix, mode).await,
+        }
+    }
+
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -15,7 +15,7 @@ use tokio::{
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -75,7 +75,7 @@ impl LocalFs {
    }

    #[cfg(test)]
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
@@ -89,52 +89,10 @@ impl LocalFs {
            })
            .collect())
    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            prefixes.push(
-                prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    ),
-            )
-        }
-
-        Ok(prefixes)
-    }

    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
@@ -186,6 +144,70 @@ impl RemoteStorage for LocalFs {

        Ok(files)
    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        let mut result = Listing::default();
+
+        if let ListingMode::NoDelimiter = mode {
+            let keys = self
+                .list_recursive(prefix)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            result.keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();
+
+            return Ok(result);
+        }
+
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
+            .await
+            .map_err(DownloadError::Other)?;
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            let stripped = prefix
+                .strip_prefix(&self.storage_root)
+                .context("Failed to strip prefix")
+                .and_then(RemotePath::new)
+                .expect(
+                    "We list files for storage root, hence should be able to remote the prefix",
+                );
+
+            if prefix.is_dir() {
+                result.prefixes.push(stripped);
+            } else {
+                result.keys.push(stripped);
+            }
+        }
+
+        Ok(result)
+    }

    async fn upload(
        &self,
@@ -479,7 +501,7 @@ mod fs_tests {

        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
        assert_eq!(
-            storage.list().await?,
+            storage.list_all().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );
@@ -667,7 +689,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
-        assert!(storage.list().await?.is_empty());
+        assert!(storage.list_all().await?.is_empty());

        storage
            .delete(&upload_target)
@@ -725,6 +747,43 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn list() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let storage = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        assert!(listing.prefixes.is_empty());
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+
+        // Delimiter: should only go one deep
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines").unwrap()].to_vec()
+        );
+        assert!(listing.keys.is_empty());
+
+        // Delimiter & prefix
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
+                ListingMode::WithDelimiter,
+            )
+            .await?;
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
+        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        Ok(())
+    }
+
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
@@ -777,7 +836,7 @@ mod fs_tests {
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
-        let mut files = storage.list().await?;
+        let mut files = storage.list_all().await?;
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -30,8 +30,8 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -299,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    /// See the doc for `RemoteStorage::list_prefixes`
-    /// Note: it wont include empty "directories"
-    async fn list_prefixes(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
+        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -314,28 +314,33 @@ impl RemoteStorage for S3Bucket {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut document_keys = Vec::new();
-
        let mut continuation_token = None;

        loop {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            let fetch_response = self
+            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
-                .set_max_keys(self.max_keys_per_list_response)
+                .set_max_keys(self.max_keys_per_list_response);
+
+            if let ListingMode::WithDelimiter = mode {
+                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+            }
+
+            let response = request
                .send()
                .await
                .context("Failed to list S3 prefixes")
@@ -345,71 +350,35 @@ impl RemoteStorage for S3Bucket {

            metrics::BUCKET_METRICS
                .req_seconds
-                .observe_elapsed(kind, &fetch_response, started_at);
+                .observe_elapsed(kind, &response, started_at);

-            let fetch_response = fetch_response?;
+            let response = response?;

-            document_keys.extend(
-                fetch_response
-                    .common_prefixes
-                    .unwrap_or_default()
-                    .into_iter()
+            let keys = response.contents().unwrap_or_default();
+            let empty = Vec::new();
+            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
+
+            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+
+            for object in keys {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                result.keys.push(remote_path);
+            }
+
+            result.prefixes.extend(
+                prefixes
+                    .iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match fetch_response.next_continuation_token {
+            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

-        Ok(document_keys)
-    }
-
-    /// See the doc for `RemoteStorage::list_files`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let kind = RequestKind::List;
-
-        let folder_name = folder
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
-
-        // AWS may need to break the response into several parts
-        let mut continuation_token = None;
-        let mut all_files = vec![];
-        loop {
-            let _guard = self.permit(kind).await;
-            let started_at = start_measuring_requests(kind);
-
-            let response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(folder_name.clone())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response)
-                .send()
-                .await
-                .context("Failed to list files in S3 bucket");
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
-
-            for object in response.contents().unwrap_or_default() {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                all_files.push(remote_path);
-            }
-            match response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-        Ok(all_files)
+        Ok(result)
    }

    async fn upload(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -5,7 +5,9 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;

-use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+use crate::{
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+};

 pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
@@ -95,6 +97,15 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_files(folder).await
    }

+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list(prefix, mode).await
+    }
+
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,6 +73,8 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+pub mod sync;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -128,6 +130,21 @@ macro_rules! project_git_version {
    };
 }

+/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
+#[macro_export]
+macro_rules! project_build_tag {
+    ($const_identifier:ident) => {
+        const $const_identifier: &::core::primitive::str = {
+            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
+                ::core::option::Option::Some(x) => ["build_tag-env:", x],
+                ::core::option::Option::None => ["build_tag:", ""],
+            };
+
+            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
+        };
+    };
+}
+
 /// Re-export for `project_git_version` macro
 #[doc(hidden)]
 pub use const_format as __const_format;
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -0,0 +1 @@
+pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -0,0 +1,350 @@
+use std::sync::{Arc, Mutex, MutexGuard};
+use tokio::sync::Semaphore;
+
+/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
+/// for the duration of initialization.
+///
+/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
+///
+/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
+pub struct OnceCell<T> {
+    inner: Mutex<Inner<T>>,
+}
+
+impl<T> Default for OnceCell<T> {
+    /// Create new uninitialized [`OnceCell`].
+    fn default() -> Self {
+        Self {
+            inner: Default::default(),
+        }
+    }
+}
+
+/// Semaphore is the current state:
+/// - open semaphore means the value is `None`, not yet initialized
+/// - closed semaphore means the value has been initialized
+#[derive(Debug)]
+struct Inner<T> {
+    init_semaphore: Arc<Semaphore>,
+    value: Option<T>,
+}
+
+impl<T> Default for Inner<T> {
+    fn default() -> Self {
+        Self {
+            init_semaphore: Arc::new(Semaphore::new(1)),
+            value: None,
+        }
+    }
+}
+
+impl<T> OnceCell<T> {
+    /// Creates an already initialized `OnceCell` with the given value.
+    pub fn new(value: T) -> Self {
+        let sem = Semaphore::new(1);
+        sem.close();
+        Self {
+            inner: Mutex::new(Inner {
+                init_semaphore: Arc::new(sem),
+                value: Some(value),
+            }),
+        }
+    }
+
+    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
+    /// returning the guard.
+    ///
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
+    ///
+    /// Initialization is panic-safe and cancellation-safe.
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
+    where
+        F: FnOnce(InitPermit) -> Fut,
+        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
+    {
+        let sem = {
+            let guard = self.inner.lock().unwrap();
+            if guard.value.is_some() {
+                return Ok(Guard(guard));
+            }
+            guard.init_semaphore.clone()
+        };
+
+        let permit = sem.acquire_owned().await;
+
+        match permit {
+            Ok(permit) => {
+                let permit = InitPermit(permit);
+                let (value, _permit) = factory(permit).await?;
+
+                let guard = self.inner.lock().unwrap();
+
+                Ok(Self::set0(value, guard))
+            }
+            Err(_closed) => {
+                let guard = self.inner.lock().unwrap();
+                assert!(
+                    guard.value.is_some(),
+                    "semaphore got closed, must be initialized"
+                );
+                return Ok(Guard(guard));
+            }
+        }
+    }
+
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
+    /// to complete initializing the inner value.
+    ///
+    /// # Panics
+    ///
+    /// If the inner has already been initialized.
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        // cannot assert that this permit is for self.inner.init_semaphore
+        let guard = self.inner.lock().unwrap();
+
+        if guard.init_semaphore.try_acquire().is_ok() {
+            drop(guard);
+            panic!("semaphore is of wrong origin");
+        }
+
+        Self::set0(value, guard)
+    }
+
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
+        if guard.value.is_some() {
+            drop(guard);
+            unreachable!("we won permit, must not be initialized");
+        }
+        guard.value = Some(value);
+        guard.init_semaphore.close();
+        Guard(guard)
+    }
+
+    /// Returns a guard to an existing initialized value, if any.
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
+        if guard.value.is_some() {
+            Some(Guard(guard))
+        } else {
+            None
+        }
+    }
+}
+
+/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
+/// initialized value.
+#[derive(Debug)]
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
+
+impl<T> std::ops::Deref for Guard<'_, T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.0
+            .value
+            .as_ref()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> std::ops::DerefMut for Guard<'_, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.0
+            .value
+            .as_mut()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<'a, T> Guard<'a, T> {
+    /// Take the current value, and a new permit for its deinitialization.
+    ///
+    /// The permit will be on a semaphore that is part of the new internal value, and any following
+    /// [`OnceCell::get_or_init`] will wait on it to complete.
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
+        let mut swapped = Inner::default();
+        let permit = swapped
+            .init_semaphore
+            .clone()
+            .try_acquire_owned()
+            .expect("we just created this");
+        std::mem::swap(&mut *self.0, &mut swapped);
+        swapped
+            .value
+            .map(|v| (v, InitPermit(permit)))
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+/// Type held by OnceCell (de)initializing task.
+pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::{
+        convert::Infallible,
+        sync::atomic::{AtomicUsize, Ordering},
+        time::Duration,
+    };
+
+    #[tokio::test]
+    async fn many_initializers() {
+        #[derive(Default, Debug)]
+        struct Counters {
+            factory_got_to_run: AtomicUsize,
+            future_polled: AtomicUsize,
+            winners: AtomicUsize,
+        }
+
+        let initializers = 100;
+
+        let cell = Arc::new(OnceCell::default());
+        let counters = Arc::new(Counters::default());
+        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
+
+        let mut js = tokio::task::JoinSet::new();
+        for i in 0..initializers {
+            js.spawn({
+                let cell = cell.clone();
+                let counters = counters.clone();
+                let barrier = barrier.clone();
+
+                async move {
+                    barrier.wait().await;
+                    let won = {
+                        let g = cell
+                            .get_or_init(|permit| {
+                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
+                                async {
+                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
+                                    Ok::<_, Infallible>((i, permit))
+                                }
+                            })
+                            .await
+                            .unwrap();
+
+                        *g == i
+                    };
+
+                    if won {
+                        counters.winners.fetch_add(1, Ordering::Relaxed);
+                    }
+                }
+            });
+        }
+
+        barrier.wait().await;
+
+        while let Some(next) = js.join_next().await {
+            next.expect("no panics expected");
+        }
+
+        let mut counters = Arc::try_unwrap(counters).unwrap();
+
+        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
+        assert_eq!(*counters.future_polled.get_mut(), 1);
+        assert_eq!(*counters.winners.get_mut(), 1);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn reinit_waits_for_deinit() {
+        // with tokio::time paused, we will "sleep" for 1s while holding the reinitialization permit
+        let sleep_for = Duration::from_secs(1);
+        let initial = 42;
+        let reinit = 1;
+        let cell = Arc::new(OnceCell::new(initial));
+
+        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
+
+        let jh = tokio::spawn({
+            let cell = cell.clone();
+            let deinitialization_started = deinitialization_started.clone();
+            async move {
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
+                assert_eq!(answer, initial);
+
+                deinitialization_started.wait().await;
+                tokio::time::sleep(sleep_for).await;
+            }
+        });
+
+        deinitialization_started.wait().await;
+
+        let started_at = tokio::time::Instant::now();
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+            .await
+            .unwrap();
+
+        let elapsed = started_at.elapsed();
+        assert!(
+            elapsed >= sleep_for,
+            "initialization should had taken at least the time time slept with permit"
+        );
+
+        jh.await.unwrap();
+
+        assert_eq!(*cell.get().unwrap(), reinit);
+    }
+
+    #[test]
+    fn reinit_with_deinit_permit() {
+        let cell = Arc::new(OnceCell::new(42));
+
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
+
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
+        assert_eq!(5, five);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
+    }
+
+    #[tokio::test]
+    async fn initialization_attemptable_until_ok() {
+        let cell = OnceCell::default();
+
+        for _ in 0..10 {
+            cell.get_or_init(|_permit| async { Err("whatever error") })
+                .await
+                .unwrap_err();
+        }
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "finally success");
+    }
+
+    #[tokio::test]
+    async fn initialization_is_cancellation_safe() {
+        let cell = OnceCell::default();
+
+        let barrier = tokio::sync::Barrier::new(2);
+
+        let initializer = cell.get_or_init(|permit| async {
+            barrier.wait().await;
+            futures::future::pending::<()>().await;
+
+            Ok::<_, Infallible>(("never reached", permit))
+        });
+
+        tokio::select! {
+            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
+            _ = barrier.wait() => {}
+        };
+
+        // now initializer is dropped
+
+        assert!(cell.get().is_none());
+
+        let g = cell
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .await
+            .unwrap();
+        assert_eq!(*g, "now initialized");
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,11 +34,12 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
-    tcp_listener,
+    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
+    signals::Signal, tcp_listener,
 };

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 const PID_FILE_NAME: &str = "pageserver.pid";

@@ -258,11 +259,12 @@ fn start_pageserver(
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
    info!(
-        "version: {} launch_timestamp: {}",
+        "version: {} launch_timestamp: {} build_tag: {}",
        version(),
-        launch_ts.to_string()
+        launch_ts.to_string(),
+        BUILD_TAG,
    );
-    set_build_info_metric(GIT_VERSION);
+    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_launch_timestamp_metric(launch_ts);
    pageserver::preinitialize_metrics();

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,8 +33,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
-    TIMELINES_SEGMENT_NAME,
+    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -633,11 +632,6 @@ impl PageServerConf {
        self.tenants_path().join(tenant_id.to_string())
    }

-    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
-            .join(TENANT_ATTACHING_MARKER_FILENAME)
-    }
-
    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -57,7 +57,10 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
-            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
+            );
            client = client.default_headers(headers);
        }

@@ -144,7 +147,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.generation)))
+            .map(|t| (t.id, Generation::new(t.gen)))
            .collect::<HashMap<_, _>>())
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -271,7 +272,9 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .map_err(Into::into)
+            .maybe_fatal_err("save deletion header")?;
+
+        Ok(())
    }
 }

@@ -360,6 +363,7 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
+            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::virtual_file::on_fatal_io_error;
+use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    Err(anyhow::anyhow!(e))
+                    on_fatal_io_error(&e, "reading deletion header");
                }
            }
        }
@@ -216,16 +218,9 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
-            Ok(d) => d,
-            Err(e) => {
-                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
-
-                // Give up: if we can't read the deletion list directory, we probably can't
-                // write lists into it later, so the queue won't work.
-                return Err(e.into());
-            }
-        };
+        let mut dir = tokio::fs::read_dir(&deletion_directory)
+            .await
+            .fatal_err("read deletion directory");

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -233,7 +228,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await? {
+        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -246,11 +241,9 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
-                    // Non-fatal error: we will just leave the file behind but not
-                    // try and load it.
-                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
-                }
+                tokio::fs::remove_file(&absolute_path)
+                    .await
+                    .fatal_err("delete temp file");

                continue;
            }
@@ -290,7 +283,9 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path).await?;
+            let list_bytes = tokio::fs::read(&list_path)
+                .await
+                .fatal_err("read deletion list");

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
@@ -349,7 +344,7 @@ impl ListWriter {
        info!("Started deletion frontend worker");

        // Synchronous, but we only do it once per process lifetime so it's tolerable
-        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
+        if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
+use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -287,16 +288,9 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-
-            if let Err(e) = tokio::fs::remove_file(&list_path).await {
-                // Unexpected: we should have permissions and nothing else should
-                // be touching these files.  We will leave the file behind.  Subsequent
-                // pageservers will try and load it again: hopefully whatever storage
-                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {list_path}: {e:#}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
-                break;
-            }
+            tokio::fs::remove_file(&list_path)
+                .await
+                .fatal_err("remove deletion list");
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{
+        self,
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        Timeline,
+    },
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: GenericRemoteStorage,
+    _storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res = disk_usage_eviction_task_iteration(
-                state,
-                task_config,
-                &storage,
-                tenants_dir,
-                &cancel,
-            )
-            .await;
+            let res =
+                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;

            match res {
                Ok(()) => {}
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -273,7 +270,6 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
-    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
+    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -349,10 +346,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        batched
-            .entry(TimelineKey(candidate.timeline))
-            .or_default()
-            .push(candidate.layer);
+        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
+        // tasks to evict all seen layers until we have evicted enough
+
+        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
+
+        // semaphore will later be used to limit eviction concurrency, and we can express at
+        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
+        // but fail gracefully by not making batches larger.
+        if batch.len() < u32::MAX as usize {
+            batch.push(candidate.layer);
+            max_batch_size = max_batch_size.max(batch.len());
+        }
    }

    let usage_planned = match warned {
@@ -369,69 +374,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    // After the loop, `usage_assumed` is the post-eviction usage,
-    // according to internal accounting.
-    let mut usage_assumed = usage_pre;
-    let mut evictions_failed = LayerCount::default();
+    let mut js = tokio::task::JoinSet::new();
+
+    // ratelimit to 1k files or any higher max batch size
+    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size = batch.len();
+        let batch_size =
+            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+
+        // NOTE(review): `available_permits` counts permits free right now, not the total ever
+        // added; batches spawned earlier may already hold permits, so this may fire -- confirm.
+        assert!(batch_size as usize <= limit.available_permits());

        debug!(%timeline_id, "evicting batch for timeline");

-        async {
-            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+        let evict = {
+            let limit = limit.clone();
+            let cancel = cancel.clone();
+            async move {
+                let mut evicted_bytes = 0;
+                let mut evictions_failed = LayerCount::default();

-            match results {
-                Err(e) => {
-                    warn!("failed to evict batch: {:#}", e);
-                }
-                Ok(results) => {
-                    assert_eq!(results.len(), batch.len());
-                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
-                        match result {
-                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
-                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                                warn!(%layer, "failed to evict layer: {detail}");
-                                evictions_failed.file_sizes += file_size;
-                                evictions_failed.count += 1;
-                            }
-                            None => {
-                                assert!(cancel.is_cancelled());
-                                return;
+                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
+                    // semaphore closing means cancelled
+                    return (evicted_bytes, evictions_failed);
+                };
+
+                let results = timeline.evict_layers(&batch, &cancel).await;
+
+                match results {
+                    Ok(results) => {
+                        assert_eq!(results.len(), batch.len());
+                        for (result, layer) in results.into_iter().zip(batch.iter()) {
+                            let file_size = layer.layer_desc().file_size;
+                            match result {
+                                Some(Ok(())) => {
+                                    evicted_bytes += file_size;
+                                }
+                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                                    evictions_failed.file_sizes += file_size;
+                                    evictions_failed.count += 1;
+                                }
+                                None => {
+                                    assert!(cancel.is_cancelled());
+                                }
                            }
                        }
                    }
+                    Err(e) => {
+                        warn!("failed to evict batch: {:#}", e);
+                    }
                }
+                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
-        .await;
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        if cancel.is_cancelled() {
+        js.spawn(evict);
+
+        // spawning multiple thousands of these is essentially blocking, so give already spawned a
+        // chance of making progress
+        tokio::task::yield_now().await;
+    }
+
+    let join_all = async move {
+        // After the evictions, `usage_assumed` is the post-eviction usage,
+        // according to internal accounting.
+        let mut usage_assumed = usage_pre;
+        let mut evictions_failed = LayerCount::default();
+
+        while let Some(res) = js.join_next().await {
+            match res {
+                Ok((evicted_bytes, failed)) => {
+                    usage_assumed.add_available_bytes(evicted_bytes);
+                    evictions_failed.file_sizes += failed.file_sizes;
+                    evictions_failed.count += failed.count;
+                }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => { /* already logged */ }
+                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            }
+        }
+        (usage_assumed, evictions_failed)
+    };
+
+    let (usage_assumed, evictions_failed) = tokio::select! {
+        tuple = join_all => { tuple },
+        _ = cancel.cancelled() => {
+            // close the semaphore to stop any pending acquires
+            limit.close();
            return Ok(IterationOutcome::Cancelled);
        }
-    }
+    };

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -446,7 +483,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Arc<dyn PersistentLayer>,
+    layer: Layer,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -569,7 +569,17 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
-          description: Tenant download is already in progress
+          description: |
+            The tenant is already known to Pageserver in some way,
+            and hence this `/attach` call has been rejected.
+
+            Some examples of how this can happen:
+            - tenant was created on this pageserver
+            - tenant attachment was started by an earlier call to `/attach`.
+
+            Callers should poll the tenant status's `attachment_status` field,
+            like for status 202. See the longer description for `POST /attach`
+            for details.
          content:
            application/json:
              schema:
@@ -713,6 +723,12 @@ paths:

        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLoadRequest"
      responses:
        "202":
          description: Tenant scheduled to load successfully
@@ -1203,6 +1219,15 @@ components:
            new_tenant_id:
              type: string
              format: hex
+            generation:
+              type: integer
+              description: Attachment generation number.
+    TenantLoadRequest:
+      type: object
+      properties:
+        generation:
+          type: integer
+          description: Attachment generation number.
    TenantAttachRequest:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1205,7 +1205,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1230,7 +1230,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, ())
    }
@@ -1500,11 +1500,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    let Some(storage) = state.remote_storage.clone() else {
+    if state.remote_storage.as_ref().is_none() {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    };
+    }

    let state = state.disk_usage_eviction_state.clone();

@@ -1522,7 +1522,6 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
-                &storage,
                usage,
                &child_cancel,
            )
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1404,7 +1404,7 @@ impl TimelineMetrics {
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub fn resident_physical_size_get(&self) -> u64 {
+    pub(crate) fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -552,7 +552,8 @@ impl Timeline {
                Err(e) => Err(PageReconstructError::from(e)),
            },
            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
                Ok(HashMap::new())
            }
        }
@@ -1202,7 +1203,8 @@ impl<'a> DatadirModification<'a> {
        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
            Ok(buf) => AuxFilesDirectory::des(&buf)?,
            Err(e) => {
-                warn!("Failed to get info about AUX files: {}", e);
+                // This is expected: historical databases do not have the key.
+                debug!("Failed to get info about AUX files: {}", e);
                AuxFilesDirectory {
                    files: HashMap::new(),
                }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,10 +3,10 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, instrument, warn, Instrument, Span};
+use tracing::{error, instrument, warn, Instrument, Span};

 use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -25,11 +25,9 @@ use super::{
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant,
+    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
 };

-const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
@@ -60,7 +58,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
 }

 async fn create_remote_delete_mark(
@@ -150,7 +148,8 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
-        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
+        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
+        let list = &list.into_iter().take(10).collect::<Vec<_>>();
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
@@ -239,32 +238,6 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

-pub(crate) async fn remote_delete_mark_exists(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-    remote_storage: &GenericRemoteStorage,
-) -> anyhow::Result<bool> {
-    // If remote storage is there we rely on it
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
-
-    let result = backoff::retry(
-        || async { remote_storage.download(&remote_mark_path).await },
-        |e| matches!(e, DownloadError::NotFound),
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        "fetch_tenant_deletion_mark",
-        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
-    )
-    .await;
-
-    match result {
-        Ok(_) => Ok(true),
-        Err(DownloadError::NotFound) => Ok(false),
-        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
-    }
-}
-
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -276,10 +249,9 @@ pub(crate) async fn remote_delete_mark_exists(
 /// 6. Remove remote mark
 /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
-/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
+/// 2. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
 ///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
@@ -378,7 +350,7 @@ impl DeleteTenantFlow {

    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
+        remote_mark_exists: bool,
        tenant: &Tenant,
    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
        let acquire = |t: &Tenant| {
@@ -389,66 +361,25 @@ impl DeleteTenantFlow {
            )
        };

-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+        if remote_mark_exists {
            return Ok(acquire(tenant));
        }

-        let remote_storage = match remote_storage {
-            Some(remote_storage) => remote_storage,
-            None => return Ok(None),
-        };
-
-        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+        let tenant_id = tenant.tenant_id;
+        // No remote mark was reported by the caller, so fall back to checking for the local mark file.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
            Ok(acquire(tenant))
        } else {
            Ok(None)
        }
    }

-    pub(crate) async fn resume_from_load(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
-        init_order: Option<&InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        let (_, progress) = completion::channel();
-
-        tenant
-            .set_stopping(progress, true, false)
-            .await
-            .expect("cant be stopping or broken");
-
-        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
-        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
-        if let Some(background) = background_jobs_can_start {
-            info!("waiting for backgound jobs barrier");
-            background.clone().wait().await;
-            info!("ready for backgound jobs barrier");
-        }
-
-        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
-        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
-        if timelines_path.exists() {
-            tenant.load(init_order, None, ctx).await.context("load")?;
-        }
-
-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
-    }
-
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
+        preload: Option<TenantPreload>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -459,7 +390,7 @@ impl DeleteTenantFlow {
            .expect("cant be stopping or broken");

        tenant
-            .attach(ctx, super::AttachMarkerMode::Expect)
+            .attach(init_order, preload, ctx)
            .await
            .context("attach")?;

--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,147 +639,10 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for layer in self.iter_historic_layers() {
-            layer.dump(verbose, ctx)?;
+        for desc in self.iter_historic_layers() {
+            desc.dump();
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::LayerMap;
-    use crate::tenant::storage_layer::LayerFileName;
-    use std::str::FromStr;
-    use std::sync::Arc;
-
-    mod l0_delta_layers_updated {
-
-        use crate::tenant::{
-            storage_layer::{AsLayerDesc, PersistentLayerDesc},
-            timeline::layer_manager::LayerFileManager,
-        };
-
-        use super::*;
-
-        struct LayerObject(PersistentLayerDesc);
-
-        impl AsLayerDesc for LayerObject {
-            fn layer_desc(&self) -> &PersistentLayerDesc {
-                &self.0
-            }
-        }
-
-        impl LayerObject {
-            fn new(desc: PersistentLayerDesc) -> Self {
-                LayerObject(desc)
-            }
-        }
-
-        type TestLayerFileManager = LayerFileManager<LayerObject>;
-
-        #[test]
-        fn for_full_range_delta() {
-            // l0_delta_layers are used by compaction, and should observe all buffered updates
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
-                 true
-             )
-        }
-
-        #[test]
-        fn for_non_full_range_delta() {
-            // has minimal uncovered areas compared to l0_delta_layers_updated_on_insert_replace_remove_for_full_range_delta
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
-                 // because not full range
-                 false
-             )
-        }
-
-        #[test]
-        fn for_image() {
-            l0_delta_layers_updated_scenario(
-                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
-                 // code only checks if it is a full range layer, doesn't care about images, which must
-                 // mean we should in practice never have full range images
-                 false
-             )
-        }
-
-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = PersistentLayerDesc::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(LayerObject::new(layer.clone()));
-            let new_version = Arc::new(LayerObject::new(layer));
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = TestLayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
-        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
-            let name = LayerFileName::from_str(layer_name).unwrap();
-            let skeleton = PersistentLayerDesc::from(name);
-
-            let remote = Arc::new(LayerObject::new(skeleton.clone()));
-            let downloaded = Arc::new(LayerObject::new(skeleton));
-
-            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();
-
-            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
-            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
-            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
-
-            let expected_in_counts = (1, usize::from(expected_l0));
-
-            map.batch_update()
-                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
-            assert_eq!(
-                count_layer_in(&map, remote.layer_desc()),
-                expected_in_counts
-            );
-
-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
-            map.batch_update().remove_historic(downloaded.layer_desc());
-            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
-        }
-
-        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
-            let historic = map
-                .iter_historic_layers()
-                .filter(|x| x.key() == layer.key())
-                .count();
-            let l0s = map
-                .get_level0_deltas()
-                .expect("why does this return a result");
-            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
-
-            (historic, l0)
-        }
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,10 +26,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{
-    create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
-    TenantState,
-};
+use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -437,14 +434,15 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        match schedule_local_tenant_processing(
+        match tenant_spawn(
            conf,
            tenant_id,
            &tenant_dir_path,
-            AttachedTenantConf::try_from(location_conf)?,
            resources.clone(),
+            AttachedTenantConf::try_from(location_conf)?,
            Some(init_order.clone()),
            &TENANTS,
+            SpawnMode::Normal,
            &ctx,
        ) {
            Ok(tenant) => {
@@ -464,15 +462,18 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

+/// Wrapper for Tenant::spawn that checks invariants before running, and returns
+/// a broken tenant (for the caller to put in the map) if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn schedule_local_tenant_processing(
+pub(crate) fn tenant_spawn(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_path: &Utf8Path,
-    location_conf: AttachedTenantConf,
    resources: TenantSharedResources,
+    location_conf: AttachedTenantConf,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
+    mode: SpawnMode,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -496,45 +497,24 @@ pub(crate) fn schedule_local_tenant_processing(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
-        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if resources.remote_storage.is_none() {
-            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(
-                conf,
-                tenant_id,
-                "attaching mark file present but no remote storage configured".to_string(),
-            )
-        } else {
-            match Tenant::spawn_attach(
-                conf,
-                tenant_id,
-                resources,
-                location_conf,
-                tenants,
-                AttachMarkerMode::Expect,
-                ctx,
-            ) {
-                Ok(tenant) => tenant,
-                Err(e) => {
-                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                }
-            }
+    info!("Attaching tenant {tenant_id}");
+    let tenant = match Tenant::spawn(
+        conf,
+        tenant_id,
+        resources,
+        location_conf,
+        init_order,
+        tenants,
+        mode,
+        ctx,
+    ) {
+        Ok(tenant) => tenant,
+        Err(e) => {
+            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
+            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
        }
-    } else {
-        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
-        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(
-            conf,
-            tenant_id,
-            location_conf,
-            resources,
-            init_order,
-            tenants,
-            ctx,
-        )
    };
+
    Ok(tenant)
 }

@@ -670,29 +650,41 @@ pub(crate) async fn create_tenant(
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
-
        let location_conf = LocationConf::attached_single(tenant_conf, generation);

        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
+        super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
+
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let created_tenant =
-            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
-                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+        let tenant_path = conf.tenant_path(&tenant_id);
+
+        let created_tenant = tenant_spawn(
+            conf,
+            tenant_id,
+            &tenant_path,
+            resources,
+            AttachedTenantConf::try_from(location_conf)?,
+            None,
+            &TENANTS,
+            SpawnMode::Create,
+            ctx,
+        )?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

        let crated_tenant_id = created_tenant.tenant_id();
        anyhow::ensure!(
-                tenant_id == crated_tenant_id,
-                "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
-            );
+            tenant_id == crated_tenant_id,
+            "loaded created tenant has unexpected tenant id \
+                (expect {tenant_id} != actual {crated_tenant_id})",
+        );
        Ok(created_tenant)
-    }).await
+    })
+    .await
 }

 #[derive(Debug, thiserror::Error)]
@@ -801,9 +793,10 @@ pub(crate) async fn upsert_location(
                }
            }

+            let tenant_path = conf.tenant_path(&tenant_id);
+
            let new_slot = match &new_location_config.mode {
                LocationMode::Secondary(_) => {
-                    let tenant_path = conf.tenant_path(&tenant_id);
                    // Directory doesn't need to be fsync'd because if we crash it can
                    // safely be recreated next time this tenant location is configured.
                    unsafe_create_dir_all(&tenant_path)
@@ -833,28 +826,21 @@ pub(crate) async fn upsert_location(
                        .await
                        .map_err(SetNewTenantConfigError::Persist)?;

-                    let tenant = match Tenant::spawn_attach(
+                    let tenant = tenant_spawn(
                        conf,
                        tenant_id,
+                        &tenant_path,
                        TenantSharedResources {
                            broker_client,
                            remote_storage,
                            deletion_queue_client,
                        },
                        AttachedTenantConf::try_from(new_location_config)?,
+                        None,
                        &TENANTS,
-                        // The LocationConf API does not use marker files, because we have Secondary
-                        // locations where the directory's existence is not a signal that it contains
-                        // all timelines.  See https://github.com/neondatabase/neon/issues/5550
-                        AttachMarkerMode::Ignore,
+                        SpawnMode::Normal,
                        ctx,
-                    ) {
-                        Ok(tenant) => tenant,
-                        Err(e) => {
-                            error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
-                            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
-                        }
-                    };
+                    )?;

                    TenantSlot::Attached(tenant)
                }
@@ -1043,7 +1029,7 @@ pub(crate) async fn load_tenant(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None,  &TENANTS, ctx)
+        let new_tenant = tenant_spawn(conf, tenant_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, None,  &TENANTS, SpawnMode::Normal, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -1117,18 +1103,12 @@ pub(crate) async fn attach_tenant(
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
+        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
-        let marker_file_exists = conf
-            .tenant_attaching_mark_file_path(&tenant_id)
-            .try_exists()
-            .context("check for attach marker file existence")?;
-        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
-
-        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
+        let attached_tenant = tenant_spawn(conf, tenant_id, &tenant_dir,
+            resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -57,8 +57,7 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
 }

-/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
-/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
+/// Fsync the given paths in parallel, asynchronously.
 pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -167,39 +167,15 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
-//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
-//!     the local filesystem, write the remote metadata to the local filesystem
 //! - After the above is done for each timeline, open the tenant for business by
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
-//! We keep track of the fact that a client is in `Attaching` state in a marker
-//! file on the local disk. This is critical because, when we restart the pageserver,
-//! we do not want to do the `List timelines` step for each tenant that has already
-//! been successfully attached (for performance & cost reasons).
-//! Instead, for a tenant without the attach marker file, we assume that the
-//! local state is in sync or ahead of the remote state. This includes the list
-//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
-//! if there's a timeline on the remote that the pageserver doesn't know about,
-//! the GC will not consider its branch point, leading to data loss.
-//! So, for a tenant with the attach marker file, we know that we do not yet have
-//! persisted all the remote timeline's metadata files locally. To exclude the
-//! risk above, we re-run the procedure for such tenants
-//!
 //! # Operating Without Remote Storage
 //!
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
-//! Theoretically, it should be ok to remove and re-add remote storage configuration to
-//! the pageserver config at any time, since it doesn't make a difference to
-//! [`Timeline::load_layer_map`].
-//! Of course, the remote timeline dir must not change while we have de-configured
-//! remote storage, i.e., the pageserver must remain the owner of the given prefix
-//! in remote storage.
-//! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -211,8 +187,7 @@ mod upload;
 use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
-// re-export these
-pub use download::{is_temp_download_file, list_remote_timelines};
+
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff::{
@@ -237,7 +212,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
@@ -255,10 +230,13 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::LayerFileName;
+use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

+pub(crate) use download::{is_temp_download_file, list_remote_timelines};
+pub(crate) use index::LayerFileMetadata;
+
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -468,7 +446,10 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
+    pub async fn download_index_file(
+        &self,
+        cancel: CancellationToken,
+    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -482,6 +463,7 @@ impl RemoteTimelineClient {
            &self.tenant_id,
            &self.timeline_id,
            self.generation,
+            cancel,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -629,13 +611,12 @@ impl RemoteTimelineClient {
    ///
    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        self.schedule_layer_file_upload0(upload_queue, layer_file_name, layer_metadata);
+        self.schedule_layer_file_upload0(upload_queue, layer);
        self.launch_queued_tasks(upload_queue);
        Ok(())
    }
@@ -643,18 +624,19 @@ impl RemoteTimelineClient {
    fn schedule_layer_file_upload0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        layer_file_name: &LayerFileName,
-        layer_metadata: &LayerFileMetadata,
+        layer: ResidentLayer,
    ) {
+        let metadata = layer.metadata();
+
        upload_queue
            .latest_files
-            .insert(layer_file_name.clone(), layer_metadata.clone());
+            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
+        info!("scheduled layer file upload {layer}");
+        let op = UploadOp::UploadLayer(layer, metadata);
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
-        info!("scheduled layer file upload {layer_file_name}");
    }

    /// Launch a delete operation in the background.
@@ -667,13 +649,13 @@ impl RemoteTimelineClient {
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: Vec<LayerFileName>,
+        names: &[LayerFileName],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        let with_generations =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);

@@ -687,17 +669,17 @@ impl RemoteTimelineClient {
    ///
    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
    /// is invoked on them.
-    #[allow(unused)] // will be used by PR#4938
-    pub(crate) fn schedule_unlinking_of_layers_from_index_part(
-        self: &Arc<Self>,
-        names: Vec<LayerFileName>,
-    ) -> anyhow::Result<()> {
+    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

        // just forget the return value; after uploading the next index_part.json, we can consider
-        // the layer files as "dangling". this is fine however.
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
+        // the layer files as "dangling". this is fine; in the worst case we create work for the
+        // scrubber.
+
+        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
+
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

        self.launch_queued_tasks(upload_queue);

@@ -706,26 +688,28 @@ impl RemoteTimelineClient {

    /// Update the remote index file, removing the to-be-deleted files from the index,
    /// allowing scheduling of actual deletions later.
-    fn schedule_unlinking_of_layers_from_index_part0(
+    fn schedule_unlinking_of_layers_from_index_part0<I>(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        names: &[LayerFileName],
-    ) -> Vec<(LayerFileName, Generation)> {
+        names: I,
+    ) -> Vec<(LayerFileName, Generation)>
+    where
+        I: IntoIterator<Item = LayerFileName>,
+    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

        // Decorate our list of names with each name's generation, dropping
-        // makes that are unexpectedly missing from our metadata.
+        // names that are unexpectedly missing from our metadata.
        let with_generations: Vec<_> = names
-            .iter()
+            .into_iter()
            .filter_map(|name| {
-                // Remove from latest_files, learning the file's remote generation in the process
-                let meta = upload_queue.latest_files.remove(name);
+                let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                    Some((name.to_owned(), meta.generation))
+                    Some((name, meta.generation))
                } else {
                    // This can only happen if we forgot to to schedule the file upload
                    // before scheduling the delete. Log it because it is a rare/strange
@@ -737,6 +721,17 @@ impl RemoteTimelineClient {
            })
            .collect();

+        #[cfg(feature = "testing")]
+        for (name, gen) in &with_generations {
+            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
+                if &unexpected == gen {
+                    tracing::error!("{name} was unlinked twice with same generation");
+                } else {
+                    tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
+                }
+            }
+        }
+
        // after unlinking files from the upload_queue.latest_files we must always schedule an
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
@@ -748,8 +743,7 @@ impl RemoteTimelineClient {
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
-    /// `index_part.json` with [`Self::schedule_unlinking_of_layers_from_index_part`].
-    #[allow(unused)] // will be used by Layer::drop in PR#4938
+    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
    pub(crate) fn schedule_deletion_of_unlinked(
        self: &Arc<Self>,
        layers: Vec<(LayerFileName, Generation)>,
@@ -771,6 +765,19 @@ impl RemoteTimelineClient {
            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
        }

+        #[cfg(feature = "testing")]
+        for (name, gen) in &with_generations {
+            match upload_queue.dangling_files.remove(name) {
+                Some(same) if &same == gen => { /* expected */ }
+                Some(other) => {
+                    tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
+                }
+                None => {
+                    tracing::error!("{name} was unlinked but was not dangling");
+                }
+            }
+        }
+
        // schedule the actual deletions
        let op = UploadOp::Delete(Delete {
            layers: with_generations,
@@ -784,19 +791,19 @@ impl RemoteTimelineClient {
    /// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
    pub(crate) fn schedule_compaction_update(
        self: &Arc<Self>,
-        compacted_from: &[LayerFileName],
-        compacted_to: &[(LayerFileName, LayerFileMetadata)],
+        compacted_from: &[Layer],
+        compacted_to: &[ResidentLayer],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        for (name, m) in compacted_to {
-            self.schedule_layer_file_upload0(upload_queue, name, m);
+        for layer in compacted_to {
+            self.schedule_layer_file_upload0(upload_queue, layer.clone());
        }

-        let with_generations =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, compacted_from);
-        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
+
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);

        Ok(())
@@ -1170,16 +1177,12 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
-                    let path = self
-                        .conf
-                        .timeline_path(&self.tenant_id, &self.timeline_id)
-                        .join(layer_file_name.file_name());
-
+                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        &path,
+                        path,
                        layer_metadata,
                        self.generation,
                    )
@@ -1453,6 +1456,8 @@ impl RemoteTimelineClient {
                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
+                        #[cfg(feature = "testing")]
+                        dangling_files: HashMap::default(),
                    };

                    let upload_queue = std::mem::replace(
@@ -1496,13 +1501,6 @@ impl RemoteTimelineClient {
            }
        }
    }
-
-    pub(crate) fn get_layer_metadata(
-        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        self.upload_queue.lock().unwrap().get_layer_metadata(name)
-    }
 }

 pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1590,6 +1588,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
+            storage_layer::Layer,
            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1732,7 +1731,11 @@ mod tests {
        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
-        let initial_index_part = match client.download_index_file().await.unwrap() {
+        let initial_index_part = match client
+            .download_index_file(CancellationToken::new())
+            .await
+            .unwrap()
+        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1758,32 +1761,29 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files,  schedule upload for them
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
-        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
-        let content_1 = dummy_contents("foo");
-        let content_2 = dummy_contents("bar");
-        let content_3 = dummy_contents("baz");

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
+        let layers = [
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
+            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
+        ]
+        .into_iter()
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
+
+            Layer::for_resident(
+                harness.conf,
+                &timeline,
+                name,
+                LayerFileMetadata::new(contents.len() as u64, generation),
+            )
+        }).collect::<Vec<_>>();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[0].clone())
            .unwrap();
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[1].clone())
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1824,7 +1824,11 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match client.download_index_file().await.unwrap() {
+        let index_part = match client
+            .download_index_file(CancellationToken::new())
+            .await
+            .unwrap()
+        {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1837,38 +1841,42 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64, generation),
-            )
+            .schedule_layer_file_upload(layers[2].clone())
            .unwrap();
+
+        // This is no longer consistent with how deletion works with Layer::drop, but this test
+        // keeps using schedule_layer_file_deletion because we don't have a way to wait for the
+        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
-            assert!(upload_queue.queued_operations.len() == 2);
-            assert!(upload_queue.inprogress_tasks.len() == 1);
-            assert!(upload_queue.num_inprogress_layer_uploads == 1);
-            assert!(upload_queue.num_inprogress_deletions == 0);
-            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
+            assert_eq!(upload_queue.queued_operations.len(), 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
+            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
+                0
+            );
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_1.file_name(),
-                &layer_file_name_2.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1882,8 +1890,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layer_file_name_2.file_name(),
-                &layer_file_name_3.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1912,6 +1920,13 @@ mod tests {
        )
        .unwrap();

+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+        );
+
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1947,10 +1962,7 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-            )
+            .schedule_layer_file_upload(layer_file_1.clone())
            .unwrap();

        let actual_b = get_bytes_started_stopped();
@@ -2015,7 +2027,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file()
+            .download_index_file(CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use crate::tenant::Generation;
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

@@ -170,53 +170,43 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
-) -> anyhow::Result<HashSet<TimelineId>> {
+    cancel: CancellationToken,
+) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

-    let timelines = download_retry(
-        || storage.list_prefixes(Some(&remote_path)),
-        &format!("list prefixes for {tenant_id}"),
+    let listing = download_retry_forever(
+        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
+        &format!("list timelines for {tenant_id}"),
+        cancel,
    )
    .await?;

-    if timelines.is_empty() {
-        anyhow::bail!("no timelines found on the remote storage")
-    }
-
    let mut timeline_ids = HashSet::new();
+    let mut other_prefixes = HashSet::new();

-    for timeline_remote_storage_key in timelines {
-        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
-            // A `deleted` key within `timelines/` is a marker file, not a timeline.  Ignore it.
-            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
-            continue;
-        }
-
+    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        let timeline_id: TimelineId = object_name
-            .parse()
-            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
-
-        // list_prefixes is assumed to return unique names. Ensure this here.
-        // NB: it's safer to bail out than warn-log this because the pageserver
-        //     needs to absolutely know about _all_ timelines that exist, so that
-        //     GC knows all the branchpoints. If we skipped over a timeline instead,
-        //     GC could delete a layer that's still needed by that timeline.
-        anyhow::ensure!(
-            !timeline_ids.contains(&timeline_id),
-            "list_prefixes contains duplicate timeline id {timeline_id}"
-        );
-        timeline_ids.insert(timeline_id);
+        match object_name.parse::<TimelineId>() {
+            Ok(t) => timeline_ids.insert(t),
+            Err(_) => other_prefixes.insert(object_name.to_string()),
+        };
    }

-    Ok(timeline_ids)
+    for key in listing.keys {
+        let object_name = key
+            .object_name()
+            .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
+        other_prefixes.insert(object_name.to_string());
+    }
+
+    Ok((timeline_ids, other_prefixes))
 }

 async fn do_download_index_part(
@@ -224,10 +214,11 @@ async fn do_download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    index_generation: Generation,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);

-    let index_part_bytes = download_retry(
+    let index_part_bytes = download_retry_forever(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;

@@ -242,6 +233,7 @@ async fn do_download_index_part(
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
+        cancel,
    )
    .await?;

@@ -263,19 +255,28 @@ pub(super) async fn download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    my_generation: Generation,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
+        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
+            .await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_id,
+        timeline_id,
+        my_generation,
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -295,8 +296,14 @@ pub(super) async fn download_index_part(
    //    we want to find the most recent index from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_id,
+        timeline_id,
+        my_generation.previous(),
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!("Found index_part from previous generation");
@@ -340,13 +347,14 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g).await
+            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
+            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
+                .await
        }
    }
 }
@@ -376,3 +384,23 @@ where
    )
    .await
 }
+
+async fn download_retry_forever<T, O, F>( // wraps `op` in backoff::retry with a cancellation token
+    op: O,
+    description: &str,
+    cancel: CancellationToken,
+) -> Result<T, DownloadError>
+where
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, DownloadError>>,
+{
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), // NOTE(review): presumably the "do not retry" predicate -- confirm
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        u32::MAX, // NOTE(review): presumably the attempt limit, i.e. effectively unbounded -- confirm
+        description,
+        backoff::Cancel::new(cancel, || DownloadError::Cancelled), // closure supplies the error value used for the cancelled case
+    )
+    .await
+}
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -98,7 +98,7 @@ impl IndexPart {
    const LATEST_VERSION: usize = 4;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];

    pub const FILE_NAME: &'static str = "index_part.json";

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -72,6 +72,8 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
+            //
+            // This is tested against `test_compaction_delete_before_upload`
            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,26 +4,21 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer;
 mod layer_desc;
-mod remote_layer;

-use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
 use bytes::Bytes;
-use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
-use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::sync::{Arc, Mutex};
+use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -39,7 +34,8 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-pub use remote_layer::RemoteLayer;
+
+pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -74,7 +70,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from Layer::get_page_reconstruct_data
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -179,26 +175,6 @@ impl LayerAccessStats {
        new
    }

-    /// Creates a clone of `self` and records `new_status` in the clone.
-    ///
-    /// The `new_status` is not recorded in `self`.
-    ///
-    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    pub(crate) fn clone_for_residence_change(
-        &self,
-        new_status: LayerResidenceStatus,
-    ) -> LayerAccessStats {
-        let clone = {
-            let inner = self.0.lock().unwrap();
-            inner.clone()
-        };
-        let new = LayerAccessStats(Mutex::new(clone));
-        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
-        new
-    }
-
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -321,95 +297,12 @@ impl LayerAccessStats {
    }
 }

-/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
-///
-/// All layers should implement a minimal `std::fmt::Debug` without tenant or
-/// timeline names, because those are known in the context of which the layers
-/// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
-    ///
-    /// Return data needed to reconstruct given page at LSN.
-    ///
-    /// It is up to the caller to collect more data from previous layer and
-    /// perform WAL redo, if necessary.
-    ///
-    /// See PageReconstructResult for possible return values. The collected data
-    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call, or a struct with a cached older image of the page if one
-    /// is available. If this returns ValueReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data' to
-    /// collect more data.
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult>;
-}
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

-/// A Layer contains all data in a "rectangle" consisting of a range of keys and
-/// range of LSNs.
-///
-/// There are two kinds of layers, in-memory and on-disk layers. In-memory
-/// layers are used to ingest incoming WAL, and provide fast access to the
-/// recent page versions. On-disk layers are stored as files on disk, and are
-/// immutable. This trait presents the common functionality of in-memory and
-/// on-disk layers.
-///
-/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
-/// A delta layer contains all modifications within a range of LSNs and keys.
-/// An image layer is a snapshot of all the data in a key-range, at a single
-/// LSN.
-pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// File name used for this layer, both in the pageserver's local filesystem
-    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName {
-        self.layer_desc().filename()
-    }
-
-    // Path to the layer file in the local filesystem.
-    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<Utf8PathBuf>;
-
-    /// Permanently remove this layer from disk.
-    fn delete_resident_layer_file(&self) -> Result<()>;
-
-    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        None
-    }
-
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        false
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
-
-    fn access_stats(&self) -> &LayerAccessStats;
-}
-
-pub fn downcast_remote_layer(
-    layer: &Arc<dyn PersistentLayer>,
-) -> Option<std::sync::Arc<RemoteLayer>> {
-    if layer.is_remote_layer() {
-        Arc::clone(layer).downcast_remote_layer()
-    } else {
-        None
-    }
-}
-
 pub mod tests {
    use super::*;

@@ -447,19 +340,6 @@ pub mod tests {
    }
 }

-/// Helper enum to hold a PageServerConf, or a path
-///
-/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
-/// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
-/// struct for a file on disk, without having a page server running, so that we have no
-/// config. In that case, we use the Path variant to hold the full path to the file on
-/// disk.
-enum PathOrConf {
-    Path(Utf8PathBuf),
-    Conf(&'static PageServerConf),
-}
-
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,18 +34,17 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{
-    PersistentLayer, ValueReconstructResult, ValueReconstructState,
-};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
@@ -59,10 +58,7 @@ use utils::{
    lsn::Lsn,
 };

-use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
-};
+use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -182,20 +178,12 @@ impl DeltaKey {
    }
 }

-/// DeltaLayer is the in-memory data structure associated with an on-disk delta
-/// file.
-///
-/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
 pub struct DeltaLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -212,6 +200,8 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

+/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
+/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -221,12 +211,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -236,19 +220,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for DeltaLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -262,40 +233,9 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

-impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
-        );
+        self.desc.dump();

        if !verbose {
            return Ok(());
@@ -303,119 +243,7 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        println!(
-            "index_start_blk: {}, root {}",
-            inner.index_start_blk, inner.index_root_blk
-        );
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
-
-        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    let err: anyhow::Error = err;
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-
-        ensure!(self.desc.key_range.contains(&key));
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-
-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
-    }
-
-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
-        Ok(())
-    }
-
-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        let access_stats = self.access_stats.as_api_model(reset);
-
-        HistoricLayerInfo::Delta {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start: lsn_range.start,
-            lsn_end: lsn_range.end,
-            remote: false,
-            access_stats,
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
-        fname: &DeltaFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.clone(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(tenant_id, timeline_id)
-                .join(fname.to_string()),
-        }
+        inner.dump(ctx).await
    }

    fn temp_path_for(
@@ -461,52 +289,21 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;

-        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(Arc::new(loaded))
    }

-    /// Create a DeltaLayer struct representing an existing file on disk.
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &DeltaFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> DeltaLayer {
-        DeltaLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_delta(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn_range.clone(),
-                file_size,
-            ),
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -520,7 +317,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -533,29 +330,9 @@ impl DeltaLayer {
        })
    }

-    fn layer_name(&self) -> DeltaFileName {
-        self.desc.delta_file_name()
-    }
    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            &self.desc.tenant_id,
-            &self.desc.timeline_id,
-            &self.layer_name(),
-        )
-    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner, ctx)
-            .await
-            .context("Layer index is corrupted")
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -660,7 +437,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+    async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -717,37 +494,21 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = DeltaLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc: PersistentLayerDesc::new_delta(
-                self.tenant_id,
-                self.timeline_id,
-                self.key_start..key_end,
-                self.lsn_range.clone(),
-                metadata.len(),
-            ),
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };
+
+        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_start..key_end,
+            self.lsn_range.clone(),
+            metadata.len(),
+        );

        // fsync the file
        file.sync_all().await?;
-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = DeltaLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            &self.tenant_id,
-            &self.timeline_id,
-            &DeltaFileName {
-                key_range: self.key_start..key_end,
-                lsn_range: self.lsn_range,
-            },
-        );
-        std::fs::rename(self.path, &final_path)?;

-        trace!("created delta layer {final_path}");
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+
+        trace!("created delta layer {}", layer.local_path());

        Ok(layer)
    }
@@ -828,8 +589,12 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        self.inner.take().unwrap().finish(key_end).await
+    pub(crate) async fn finish(
+        mut self,
+        key_end: Key,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<ResidentLayer> {
+        self.inner.take().unwrap().finish(key_end, timeline).await
    }
 }

@@ -967,15 +732,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
-        this: &'a T,
-        ctx: &'b RequestContext,
+    pub(super) async fn load_keys<'a>(
+        &'a self,
+        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
+        let file = &self.file;

-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -988,7 +755,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(dl),
+                            Adapter(self),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -1015,10 +782,61 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
+
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        println!(
+            "index_start_blk: {}, root {}",
+            self.index_start_blk, self.index_root_blk
+        );
+
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = self.load_keys(ctx).await?;
+
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val, ctx).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1058,3 +876,9 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
+
+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,21 +31,23 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
+use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
+use pageserver_api::models::LayerAccessKind;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::{self, File};
+use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
+use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -56,7 +58,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};

 ///
 /// Header stored in the beginning of the file
@@ -114,22 +116,14 @@ impl Summary {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
-///
-/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
-/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
-/// Otherwise the struct is just a placeholder for a file that exists on disk,
-/// and it needs to be loaded before using it in queries.
+/// This is used only from `pagectl`. Within pageserver, all layers are
+/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
 pub struct ImageLayer {
-    path_or_conf: PathOrConf,
-
+    path: Utf8PathBuf,
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
-
    access_stats: LayerAccessStats,
-
    inner: OnceCell<ImageLayerInner>,
 }

@@ -146,6 +140,8 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

+/// ImageLayer is the in-memory data structure associated with an on-disk image
+/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -166,73 +162,11 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
-impl Layer for ImageLayer {
-    /// Look up given page in the file
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
-            .await
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for ImageLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        self.local_path()
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        self.delete_resident_layer_file()
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        self.info(reset)
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        self.access_stats()
-    }
-}
-
-impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-            self.desc.tenant_id,
-            self.desc.timeline_id,
-            self.desc.key_range.start,
-            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental(),
-            self.desc.file_size
-        );
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-        let file = &inner.file;
+impl ImageLayerInner {
+    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        let file = &self.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);

        tree_reader.dump().await?;

@@ -250,69 +184,36 @@ impl ImageLayer {

        Ok(())
    }
+}

-    pub(crate) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.desc.key_range.contains(&key));
-        assert!(lsn_range.start >= self.lsn);
-        assert!(lsn_range.end >= self.lsn);
-
-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state, ctx)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path()))
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
    }
+}

-    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
-        Some(self.path())
+impl AsLayerDesc for ImageLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }
+}
+
+impl ImageLayer {
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        self.desc.dump();
+
+        if !verbose {
+            return Ok(());
+        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+
+        inner.dump(ctx).await?;

-    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
-        // delete underlying file
-        fs::remove_file(self.path())?;
        Ok(())
    }

-    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_start = self.layer_desc().image_layer_lsn();
-
-        HistoricLayerInfo::Image {
-            layer_file_name,
-            layer_file_size: self.desc.file_size,
-            lsn_start,
-            remote: false,
-            access_stats: self.access_stats.as_api_model(reset),
-        }
-    }
-
-    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-
-    fn path_for(
-        path_or_conf: &PathOrConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        fname: &ImageFileName,
-    ) -> Utf8PathBuf {
-        match path_or_conf {
-            PathOrConf::Path(path) => path.to_path_buf(),
-            PathOrConf::Conf(conf) => conf
-                .timeline_path(&tenant_id, &timeline_id)
-                .join(fname.to_string()),
-        }
-    }
-
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -348,54 +249,21 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
-                .await?;
+        // not production code
+        let actual_filename = path.file_name().unwrap().to_owned();
+        let expected_filename = self.layer_desc().filename().file_name();

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
-            }
+        if actual_filename != expected_filename {
+            println!("warning: filename does not match what is expected from in-file summary");
+            println!("actual: {:?}", actual_filename);
+            println!("expected: {:?}", expected_filename);
        }

        Ok(loaded)
    }

-    /// Create an ImageLayer struct representing an existing file on disk
-    pub fn new(
-        conf: &'static PageServerConf,
-        timeline_id: TimelineId,
-        tenant_id: TenantId,
-        filename: &ImageFileName,
-        file_size: u64,
-        access_stats: LayerAccessStats,
-    ) -> ImageLayer {
-        ImageLayer {
-            path_or_conf: PathOrConf::Conf(conf),
-            desc: PersistentLayerDesc::new_img(
-                tenant_id,
-                timeline_id,
-                filename.key_range.clone(),
-                filename.lsn,
-                file_size,
-            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
-            lsn: filename.lsn,
-            access_stats,
-            inner: OnceCell::new(),
-        }
-    }
-
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -407,7 +275,7 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path_or_conf: PathOrConf::Path(path.to_path_buf()),
+            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -421,18 +289,8 @@ impl ImageLayer {
        })
    }

-    fn layer_name(&self) -> ImageFileName {
-        self.desc.image_file_name()
-    }
-
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> Utf8PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.desc.timeline_id,
-            self.desc.tenant_id,
-            &self.layer_name(),
-        )
+    fn path(&self) -> Utf8PathBuf {
+        self.path.clone()
    }
 }

@@ -604,7 +462,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    async fn finish(self) -> anyhow::Result<ImageLayer> {
+    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -658,33 +516,14 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-        let layer = ImageLayer {
-            path_or_conf: PathOrConf::Conf(self.conf),
-            desc,
-            lsn: self.lsn,
-            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
-        };

        // fsync the file
        file.sync_all().await?;

-        // Rename the file to its final name
-        //
-        // Note: This overwrites any existing file. There shouldn't be any.
-        // FIXME: throw an error instead?
-        let final_path = ImageLayer::path_for(
-            &PathOrConf::Conf(self.conf),
-            self.timeline_id,
-            self.tenant_id,
-            &ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn,
-            },
-        );
-        std::fs::rename(self.path, final_path)?;
+        // FIXME: why not carry the virtualfile here, it supports renaming?
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.path());
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -746,8 +585,11 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
-        self.inner.take().unwrap().finish().await
+    pub(crate) async fn finish(
+        mut self,
+        timeline: &Arc<Timeline>,
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,11 +10,12 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
+use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::collections::HashMap;
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -28,7 +29,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayer, DeltaLayerWriter, Layer};
+use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -207,20 +208,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
-impl Layer for InMemoryLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: &mut ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
-            .await
-    }
-}
-
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -229,17 +216,13 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
-    ///
    /// Get layer size.
-    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

-    ///
    /// Create a new, empty, in-memory layer
-    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -331,7 +314,11 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(
+        &self,
+        timeline: &Arc<Timeline>,
+        ctx: &RequestContext,
+    ) -> Result<ResidentLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -376,7 +363,8 @@ impl InMemoryLayer {
            }
        }

-        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
+        // MAX is used here because we identify L0 layers by full key range
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,4 +1,3 @@
-use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -6,7 +5,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{context::RequestContext, repository::Key};
+use crate::repository::Key;

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -100,6 +99,22 @@ impl PersistentLayerDesc {
        }
    }

+    pub fn from_filename(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
+        match filename {
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => {
+                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+            }
+        }
+    }
+
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -173,21 +188,31 @@ impl PersistentLayerDesc {
        self.is_delta
    }

-    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental(),
-            self.file_size,
-        );
-
-        Ok(())
+    pub fn dump(&self) {
+        if self.is_delta {
+            println!(
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.lsn_range.start,
+                self.lsn_range.end,
+                self.is_incremental(),
+                self.file_size,
+            );
+        } else {
+            println!(
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_id,
+                self.timeline_id,
+                self.key_range.start,
+                self.key_range.end,
+                self.image_layer_lsn(),
+                self.is_incremental(),
+                self.file_size
+            );
+        }
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -1,216 +0,0 @@
-//! A RemoteLayer is an in-memory placeholder for a layer file that exists
-//! in remote storage.
-//!
-use crate::config::PageServerConf;
-use crate::context::RequestContext;
-use crate::repository::Key;
-use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::timeline::layer_manager::LayerManager;
-use anyhow::{bail, Result};
-use camino::Utf8PathBuf;
-use pageserver_api::models::HistoricLayerInfo;
-use std::ops::Range;
-use std::sync::Arc;
-
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-use super::filename::{DeltaFileName, ImageFileName};
-use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
-};
-
-/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`].
-///
-/// RemoteLayer might be downloaded on-demand during operations which are
-/// allowed download remote layers and during which, it gets replaced with a
-/// concrete `DeltaLayer` or `ImageLayer`.
-///
-/// See: [`crate::context::RequestContext`] for authorization to download
-pub struct RemoteLayer {
-    pub desc: PersistentLayerDesc,
-
-    pub layer_metadata: LayerFileMetadata,
-
-    access_stats: LayerAccessStats,
-
-    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
-
-    /// Has `LayerMap::replace` failed for this (true) or not (false).
-    ///
-    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
-    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
-    /// unprocessable, because a LayerMap::replace failed.
-    ///
-    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
-    /// a possible fast loop between `Timeline::get_reconstruct_data` and
-    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
-    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
-}
-
-impl std::fmt::Debug for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.desc.filename())
-            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.desc.is_incremental())
-            .finish()
-    }
-}
-
-#[async_trait::async_trait]
-impl Layer for RemoteLayer {
-    async fn get_value_reconstruct_data(
-        &self,
-        _key: Key,
-        _lsn_range: Range<Lsn>,
-        _reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
-    ) -> Result<ValueReconstructResult> {
-        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
-    }
-}
-
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
-    }
-}
-
-impl AsLayerDesc for RemoteLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
-    }
-}
-
-impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<Utf8PathBuf> {
-        None
-    }
-
-    fn delete_resident_layer_file(&self) -> Result<()> {
-        bail!("remote layer has no layer file");
-    }
-
-    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
-        Some(self)
-    }
-
-    fn is_remote_layer(&self) -> bool {
-        true
-    }
-
-    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.layer_desc().filename().file_name();
-        let lsn_range = self.layer_desc().lsn_range.clone();
-
-        if self.desc.is_delta {
-            HistoricLayerInfo::Delta {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                lsn_end: lsn_range.end,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        } else {
-            HistoricLayerInfo::Image {
-                layer_file_name,
-                layer_file_size: self.layer_metadata.file_size(),
-                lsn_start: lsn_range.start,
-                remote: true,
-                access_stats: self.access_stats.as_api_model(reset),
-            }
-        }
-    }
-
-    fn access_stats(&self) -> &LayerAccessStats {
-        &self.access_stats
-    }
-}
-
-impl RemoteLayer {
-    pub fn new_img(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &ImageFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_img(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn,
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    pub fn new_delta(
-        tenantid: TenantId,
-        timelineid: TimelineId,
-        fname: &DeltaFileName,
-        layer_metadata: &LayerFileMetadata,
-        access_stats: LayerAccessStats,
-    ) -> RemoteLayer {
-        RemoteLayer {
-            desc: PersistentLayerDesc::new_delta(
-                tenantid,
-                timelineid,
-                fname.key_range.clone(),
-                fname.lsn_range.clone(),
-                layer_metadata.file_size(),
-            ),
-            layer_metadata: layer_metadata.clone(),
-            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
-            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
-            access_stats,
-        }
-    }
-
-    /// Create a Layer struct representing this layer, after it has been downloaded.
-    pub(crate) fn create_downloaded_layer(
-        &self,
-        _layer_map_lock_held_witness: &LayerManager,
-        conf: &'static PageServerConf,
-        file_size: u64,
-    ) -> Arc<dyn PersistentLayer> {
-        if self.desc.is_delta {
-            let fname = self.desc.delta_file_name();
-            Arc::new(DeltaLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        } else {
-            let fname = self.desc.image_file_name();
-            Arc::new(ImageLayer::new(
-                conf,
-                self.desc.timeline_id,
-                self.desc.tenant_id,
-                &fname,
-                file_size,
-                self.access_stats
-                    .clone_for_residence_change(LayerResidenceStatus::Resident),
-            ))
-        }
-    }
-}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::completion;
+use utils::{backoff, completion};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
@@ -139,7 +139,10 @@ pub fn start_background_loops(
 /// Compaction task's main loop
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -159,8 +162,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
            // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first && random_init_delay(period, &cancel).await.is_err() {
-                break;
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
            }

            let started_at = Instant::now();
@@ -173,23 +179,24 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
-                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };

-            if !first {
-                // The first iteration is typically much slower, because all tenants compete for the
-                // compaction sempahore to run, and because of concurrent startup work like initializing
-                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
-                warn_when_period_overrun(
-                    started_at.elapsed(),
-                    period,
-                    BackgroundLoopKind::Compaction,
-                );
-            }
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -198,8 +205,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
-
-            first = false;
        }
    }
    .await;
@@ -210,7 +215,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 /// GC task's main loop
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    let wait_duration = Duration::from_secs(2);
+    const MAX_BACKOFF_SECS: f64 = 300.0;
+    // How many errors we have seen consecutively
+    let mut error_run_count = 0;
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -231,8 +239,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            let period = tenant.get_gc_period();

-            if first && random_init_delay(period, &cancel).await.is_err() {
-                break;
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
            }

            let started_at = Instant::now();
@@ -249,19 +260,24 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
                    .await;
                if let Err(e) = res {
-                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
-                    wait_duration
+                    let wait_duration = backoff::exponential_backoff_duration_seconds(
+                        error_run_count,
+                        1.0,
+                        MAX_BACKOFF_SECS,
+                    );
+                    error_run_count += 1;
+                    error!(
+                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
+                        wait_duration
+                    );
+                    Duration::from_secs_f64(wait_duration)
                } else {
+                    error_run_count = 0;
                    period
                }
            };

-            if !first {
-                // The first iteration is typically much slower, because all tenants compete for the
-                // compaction sempahore to run, and because of concurrent startup work like initializing
-                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
-                warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
-            }
+            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -270,8 +286,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
-
-            first = false;
        }
    }
    .await;
@@ -347,7 +361,7 @@ pub(crate) fn warn_when_period_overrun(
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
        // intelligent. however it makes sense to keep the "configuration format" for period, even
        // though there's no way to output the actual config value.
-        warn!(
+        info!(
            ?elapsed,
            period = %humantime::format_duration(period),
            ?task,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -38,6 +38,14 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    }
    debug!("wal receiver shutdown confirmed");

+    // Shut down the layer flush task before the remote client, as one depends on the other
+    task_mgr::shutdown_tasks(
+        Some(TaskKind::LayerFlushTask),
+        Some(timeline.tenant_id),
+        Some(timeline.timeline_id),
+    )
+    .await;
+
    // Prevent new uploads from starting.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        let res = remote_client.stop();
@@ -294,6 +302,7 @@ async fn cleanup_remaining_timeline_fs_traces(
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
+        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
 }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,7 +29,6 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        storage_layer::PersistentLayer,
        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
@@ -210,15 +209,26 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<Arc<dyn PersistentLayer>> = {
+        let candidates: Vec<_> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-                if hist_layer.is_remote_layer() {
-                    continue;
-                }
+
+                // guard against eviction while we inspect it; it might be that eviction_task and
+                // disk_usage_eviction_task both select the same layers to be evicted, and
+                // seemingly free up double the space. both succeeding is of no consequence.
+                let guard = match hist_layer.keep_resident().await {
+                    Ok(Some(l)) => l,
+                    Ok(None) => continue,
+                    Err(e) => {
+                        // these should not happen, but we cannot make them statically impossible right
+                        // now.
+                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
+                        continue;
+                    }
+                };

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -249,7 +259,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(hist_layer)
+                    candidates.push(guard.drop_eviction_guard())
                }
            }
            candidates
@@ -268,7 +278,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
+            .evict_layer_batch(remote_client, &candidates, cancel)
            .await
        {
            Err(pre_err) => {
@@ -279,7 +289,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for (l, result) in candidates.iter().zip(results) {
+        for result in results {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -287,24 +297,10 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::FileNotFound)) => {
+                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
-                }
-                Some(Err(EvictionError::MetadataInconsistency(detail))) => {
-                    warn!(layer = %l, "failed to evict layer: {detail}");
-                    stats.not_evictable += 1;
-                }
            }
        }
        if stats.candidates == stats.not_evictable {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -12,27 +12,16 @@ use crate::{
    tenant::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
-            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
-            PersistentLayerDesc, PersistentLayerKey,
+            AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
+            ResidentLayer,
        },
-        timeline::compare_arced_layers,
    },
 };

 /// Provides semantic APIs to manipulate the layer map.
 pub(crate) struct LayerManager {
    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager,
-}
-
-/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
-/// scheduling deletes in remote client.
-pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
-
-impl ApplyGcResultGuard<'_> {
-    pub(crate) fn flush(self) {
-        self.0.flush();
-    }
+    layer_fmgr: LayerFileManager<Layer>,
 }

 impl LayerManager {
@@ -43,7 +32,7 @@ impl LayerManager {
        }
    }

-    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }

@@ -55,21 +44,12 @@ impl LayerManager {
        &self.layer_map
    }

-    /// Replace layers in the layer file manager, used in evictions and layer downloads.
-    pub(crate) fn replace_and_verify(
-        &mut self,
-        expected: Arc<dyn PersistentLayer>,
-        new: Arc<dyn PersistentLayer>,
-    ) -> Result<()> {
-        self.layer_fmgr.replace_and_verify(expected, new)
-    }
-
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
    pub(crate) fn initialize_local_layers(
        &mut self,
-        on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
+        on_disk_layers: Vec<Layer>,
        next_open_layer_at: Lsn,
    ) {
        let mut updates = self.layer_map.batch_update();
@@ -164,10 +144,19 @@ impl LayerManager {
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
-    pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
+    pub(crate) fn track_new_image_layers(
+        &mut self,
+        image_layers: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
-            Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+
+            // record these here instead of Layer::finish_creating because otherwise partial
+            // failure with create_image_layers would balloon up the physical size gauge. downside
+            // is that all layers need to be created before metrics are updated.
+            metrics.record_new_file_metrics(layer.layer_desc().file_size);
        }
        updates.flush();
    }
@@ -175,76 +164,71 @@ impl LayerManager {
    /// Flush a frozen layer and add the written delta layer to the layer map.
    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: Option<DeltaLayer>,
+        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
+        metrics: &TimelineMetrics,
    ) {
-        let l = self.layer_map.frozen_layers.pop_front();
-        let mut updates = self.layer_map.batch_update();
+        let inmem = self
+            .layer_map
+            .frozen_layers
+            .pop_front()
+            .expect("there must be a inmem layer to flush");

-        // Only one thread may call this function at a time (for this
-        // timeline). If two threads tried to flush the same frozen
+        // Only one task may call this function at a time (for this
+        // timeline). If two tasks tried to flush the same frozen
        // layer to disk at the same time, that would not work.
-        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
+        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));

-        if let Some(delta_layer) = delta_layer {
-            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
+        if let Some(l) = delta_layer {
+            let mut updates = self.layer_map.batch_update();
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
+            updates.flush();
        }
-        updates.flush();
    }

    /// Called when compaction is completed.
    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        compact_from: Vec<Arc<dyn PersistentLayer>>,
-        compact_to: Vec<Arc<dyn PersistentLayer>>,
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
        metrics: &TimelineMetrics,
-    ) -> Result<()> {
+    ) {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
-            Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
+            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
-            // NB: the layer file identified by descriptor `l` is guaranteed to be present
-            // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
-            // time, even though we dropped `Timeline::layers` inbetween.
-            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                l,
-                &mut updates,
-                metrics,
-                &mut self.layer_fmgr,
-            )?;
+            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
-        Ok(())
    }

    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
    pub(crate) fn finish_gc_timeline(
        &mut self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Arc<dyn PersistentLayer>>,
-        metrics: &TimelineMetrics,
-    ) -> Result<ApplyGcResultGuard> {
+        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        gc_layers: Vec<Layer>,
+    ) {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(
-                layer_removal_cs.clone(),
-                doomed_layer,
+                layer_removal_cs,
+                &doomed_layer,
                &mut updates,
-                metrics,
                &mut self.layer_fmgr,
-            )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
+            );
        }
-        Ok(ApplyGcResultGuard(updates))
+        updates.flush()
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
-        layer: Arc<dyn PersistentLayer>,
+        layer: Layer,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
+        mapping: &mut LayerFileManager<Layer>,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
@@ -254,17 +238,12 @@ impl LayerManager {
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layer: Arc<dyn PersistentLayer>,
+        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer: &Layer,
        updates: &mut BatchedUpdates<'_>,
-        metrics: &TimelineMetrics,
-        mapping: &mut LayerFileManager,
-    ) -> anyhow::Result<()> {
+        mapping: &mut LayerFileManager<Layer>,
+    ) {
        let desc = layer.layer_desc();
-        if !layer.is_remote_layer() {
-            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_sub(desc.file_size);
-        }

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
@@ -273,21 +252,18 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-
-        Ok(())
+        layer.garbage_collect_on_drop();
    }

-    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
+    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
 }

-pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
-    HashMap<PersistentLayerKey, Arc<T>>,
-);
+pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

-impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
+impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
@@ -297,14 +273,14 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            .clone()
    }

-    pub(crate) fn insert(&mut self, layer: Arc<T>) {
+    pub(crate) fn insert(&mut self, layer: T) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

-    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
+    pub(crate) fn contains(&self, layer: &T) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }

@@ -312,7 +288,7 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        Self(HashMap::new())
    }

-    pub(crate) fn remove(&mut self, layer: Arc<T>) {
+    pub(crate) fn remove(&mut self, layer: &T) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
@@ -321,39 +297,4 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
            )
        }
    }
-
-    pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
-        let key = expected.layer_desc().key();
-        let other = new.layer_desc().key();
-
-        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
-        let new_l0 = LayerMap::is_l0(new.layer_desc());
-
-        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
-            "layermap-replace-notfound"
-        ));
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new layer have different keys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
-        );
-
-        if let Some(layer) = self.0.get_mut(&key) {
-            anyhow::ensure!(
-                compare_arced_layers(&expected, layer),
-                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
-                expected = Arc::as_ptr(&expected),
-                new = Arc::as_ptr(layer),
-            );
-            *layer = new;
-            Ok(())
-        } else {
-            anyhow::bail!("layer was not found");
-        }
-    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -122,7 +122,7 @@ pub(super) async fn handle_walreceiver_connection(
    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");

-    let (mut replication_client, connection) = {
+    let (replication_client, connection) = {
        let mut config = wal_source_connconf.to_tokio_postgres_config();
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
@@ -205,7 +205,7 @@ pub(super) async fn handle_walreceiver_connection(
        gauge.dec();
    }

-    let identify = identify_system(&mut replication_client).await?;
+    let identify = identify_system(&replication_client).await?;
    info!("{identify:?}");

    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
@@ -444,7 +444,7 @@ struct IdentifySystem {
 struct IdentifyError;

 /// Run the postgres `IDENTIFY_SYSTEM` command
-async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
+async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
    let query_str = "IDENTIFY_SYSTEM";
    let response = client.simple_query(query_str).await?;

@@ -459,7 +459,7 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>

    // extract the row contents into an IdentifySystem struct.
    // written as a closure so I can use ? for Option here.
-    if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
+    if let Some(SimpleQueryMessage::Row(first_row)) = response.first() {
        Ok(IdentifySystem {
            systemid: get_parse(first_row, 0)?,
            timeline: get_parse(first_row, 1)?,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,4 +1,5 @@
 use super::storage_layer::LayerFileName;
+use super::storage_layer::ResidentLayer;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -79,6 +80,14 @@ pub(crate) struct UploadQueueInitialized {
    /// tasks to finish. For example, metadata upload cannot be performed before all
    /// preceding layer file uploads have completed.
    pub(crate) queued_operations: VecDeque<UploadOp>,
+
+    /// Files which have been unlinked but have not yet had a deletion scheduled. Only kept around
+    /// for error logging.
+    ///
+    /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
+    /// bug causing leaks, then it's better to not leave this enabled for production builds.
+    #[cfg(feature = "testing")]
+    pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
 }

 impl UploadQueueInitialized {
@@ -135,6 +144,8 @@ impl UploadQueue {
            num_inprogress_deletions: 0,
            inprogress_tasks: HashMap::new(),
            queued_operations: VecDeque::new(),
+            #[cfg(feature = "testing")]
+            dangling_files: HashMap::new(),
        };

        *self = UploadQueue::Initialized(state);
@@ -180,6 +191,8 @@ impl UploadQueue {
            num_inprogress_deletions: 0,
            inprogress_tasks: HashMap::new(),
            queued_operations: VecDeque::new(),
+            #[cfg(feature = "testing")]
+            dangling_files: HashMap::new(),
        };

        *self = UploadQueue::Initialized(state);
@@ -203,18 +216,6 @@ impl UploadQueue {
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
-
-    pub(crate) fn get_layer_metadata(
-        &self,
-        name: &LayerFileName,
-    ) -> anyhow::Result<Option<LayerFileMetadata>> {
-        match self {
-            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
-        }
-    }
 }

 /// An in-progress upload or delete task.
@@ -237,7 +238,7 @@ pub(crate) struct Delete {
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
-    UploadLayer(LayerFileName, LayerFileMetadata),
+    UploadLayer(ResidentLayer, LayerFileMetadata),

    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
@@ -252,13 +253,13 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(path, metadata) => {
+            UploadOp::UploadLayer(layer, metadata) => {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    path.file_name(),
+                    layer,
                    metadata.file_size(),
-                    metadata.generation,
+                    metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};
+use utils::fs_ext;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -173,37 +174,78 @@ impl OpenFiles {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub enum CrashsafeOverwriteError {
-    #[error("final path has no parent dir")]
-    FinalPathHasNoParentDir,
-    #[error("remove tempfile")]
-    RemovePreviousTempfile(#[source] std::io::Error),
-    #[error("create tempfile")]
-    CreateTempfile(#[source] std::io::Error),
-    #[error("write tempfile")]
-    WriteContents(#[source] std::io::Error),
-    #[error("sync tempfile")]
-    SyncTempfile(#[source] std::io::Error),
-    #[error("rename tempfile to final path")]
-    RenameTempfileToFinalPath(#[source] std::io::Error),
-    #[error("open final path parent dir")]
-    OpenFinalPathParentDir(#[source] std::io::Error),
-    #[error("sync final path parent dir")]
-    SyncFinalPathParentDir(#[source] std::io::Error),
+/// Identify error types that should always terminate the process.  Other
+/// error types may be eligible for retry.
+pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
+    use nix::errno::Errno::*;
+    match e.raw_os_error().map(nix::errno::from_i32) {
+        Some(EIO) => {
+            // Terminate on EIO because we no longer trust the device to store
+            // data safely, or to uphold persistence guarantees on fsync.
+            true
+        }
+        Some(EROFS) => {
+            // Terminate on EROFS because a filesystem is usually remounted
+            // readonly when it has experienced some critical issue, so the same
+            // logic as EIO applies.
+            true
+        }
+        Some(EACCES) => {
+            // Terminate on EACCES because we should always have permissions
+            // for our own data dir: if we don't, then we can't do our job and
+            // need administrative intervention to fix permissions.  Terminating
+            // is the best way to make sure we stop cleanly rather than going
+            // into infinite retry loops, and will make it clear to the outside
+            // world that we need help.
+            true
+        }
+        _ => {
+            // Treat all other local file I/O errors as retryable.  This includes:
+            // - ENOSPC: we stay up and wait for eviction to free some space
+            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
+            // - WriteZero, Interrupted: these are used internally by VirtualFile
+            false
+        }
+    }
 }
-impl CrashsafeOverwriteError {
-    /// Returns true iff the new contents are durably stored.
-    pub fn are_new_contents_durable(&self) -> bool {
+
+/// Call this when the local filesystem gives us an error with an external
+/// cause: this includes EIO, EROFS, and EACCES: all these indicate either
+/// bad storage or bad configuration, and we can't fix that from inside
+/// a running process.
+pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
+    tracing::error!("Fatal I/O error: {e}: {context})");
+    std::process::abort();
+}
+
+pub(crate) trait MaybeFatalIo<T> {
+    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
+    fn fatal_err(self, context: &str) -> T;
+}
+
+impl<T> MaybeFatalIo<T> for std::io::Result<T> {
+    /// Terminate the process if the result is an error of a fatal type, else pass it through
+    ///
+    /// This is appropriate for writes, where we typically want to die on EIO/EACCES etc, but
+    /// not on ENOSPC.
+    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
+        if let Err(e) = &self {
+            if is_fatal_io_error(e) {
+                on_fatal_io_error(e, context);
+            }
+        }
+        self
+    }
+
+    /// Terminate the process on any I/O error.
+    ///
+    /// This is appropriate for reads on files that we know exist: they should always work.
+    fn fatal_err(self, context: &str) -> T {
        match self {
-            Self::FinalPathHasNoParentDir => false,
-            Self::RemovePreviousTempfile(_) => false,
-            Self::CreateTempfile(_) => false,
-            Self::WriteContents(_) => false,
-            Self::SyncTempfile(_) => false,
-            Self::RenameTempfileToFinalPath(_) => false,
-            Self::OpenFinalPathParentDir(_) => false,
-            Self::SyncFinalPathParentDir(_) => true,
+            Ok(v) => v,
+            Err(e) => {
+                on_fatal_io_error(&e, context);
+            }
        }
    }
 }
@@ -284,15 +326,13 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> Result<(), CrashsafeOverwriteError> {
+    ) -> std::io::Result<()> {
        let Some(final_path_parent) = final_path.parent() else {
-            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
+            return Err(std::io::Error::from_raw_os_error(
+                nix::errno::Errno::EINVAL as i32,
+            ));
        };
-        match std::fs::remove_file(tmp_path) {
-            Ok(()) => {}
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
-        }
+        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
        let mut file = Self::open_with_options(
            tmp_path,
            OpenOptions::new()
@@ -301,31 +341,20 @@ impl VirtualFile {
                // we bail out instead of causing damage.
                .create_new(true),
        )
-        .await
-        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
-        file.write_all(content)
-            .await
-            .map_err(CrashsafeOverwriteError::WriteContents)?;
-        file.sync_all()
-            .await
-            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
+        .await?;
+        file.write_all(content).await?;
+        file.sync_all().await?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)
-            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
+        std::fs::rename(tmp_path, final_path)?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
        // VirtualFile., and it eventually does a blocking write lock instead of
        // try_lock.
        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
-                .await
-                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
-        final_parent_dirfd
-            .sync_all()
-            .await
-            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
+        final_parent_dirfd.sync_all().await?;
        Ok(())
    }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -443,7 +443,7 @@ impl<'a> WalIngest<'a> {
        &mut self,
        buf: &mut Bytes,
        modification: &mut DatadirModification<'_>,
-        decoded: &mut DecodedWALRecord,
+        decoded: &DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Handle VM bit updates that are implicitly part of heap records.
@@ -749,7 +749,7 @@ impl<'a> WalIngest<'a> {
        &mut self,
        buf: &mut Bytes,
        modification: &mut DatadirModification<'_>,
-        decoded: &mut DecodedWALRecord,
+        decoded: &DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Handle VM bit updates that are implicitly part of heap records.
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -857,7 +857,8 @@ impl WalRedoProcess {
            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            } else if in_revents.contains(PollFlags::POLLHUP) {
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
            }
@@ -907,7 +908,8 @@ impl WalRedoProcess {
                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                } else if out_revents.contains(PollFlags::POLLHUP) {
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
                }
            }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,6 +19,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
+#include "c.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -63,6 +64,21 @@ int			max_reconnect_attempts = 60;
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
+static void pageserver_disconnect(void);
+
+
+static pqsigfunc	 prev_signal_handler;
+
+static void
+pageserver_sighup_handler(SIGNAL_ARGS)
+{
+	if (prev_signal_handler)
+	{
+        	prev_signal_handler(postgres_signal_arg);
+	}
+	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
+	pageserver_disconnect();
+}

 static bool
 pageserver_connect(int elevel)
@@ -400,7 +416,7 @@ pg_init_libpagestore(void)
 							   NULL,
 							   &page_server_connstring,
 							   "",
-							   PGC_POSTMASTER,
+							   PGC_SIGHUP,
 							   0,	/* no flags required */
 							   NULL, NULL, NULL);

@@ -482,5 +498,8 @@ pg_init_libpagestore(void)
 		old_redo_read_buffer_filter = redo_read_buffer_filter;
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
+
+        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
+
 	lfc_init();
 }
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -201,6 +201,16 @@ WalRedoMain(int argc, char *argv[])
 #endif

 	am_wal_redo_postgres = true;
+	/*
+	 * Pageserver treats any output to stderr as an ERROR, so we must
+	 * set the log level as early as possible to only log FATAL and 
+	 * above during WAL redo (note that loglevel ERROR also logs LOG,
+	 * which is super strange but that's not something we can solve
+	 * for here. ¯\_(-_-)_/¯
+	 */
+	SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE);
+	SetConfigOption("client_min_messages", "ERROR", PGC_SUSET,
+					PGC_S_OVERRIDE);

 	/*
 	 * WAL redo does not need a large number of buffers. And speed of
@@ -885,7 +895,12 @@ apply_error_callback(void *arg)
 	StringInfoData buf;

 	initStringInfo(&buf);
-	xlog_outdesc(&buf, record);
+#if PG_VERSION_NUM >= 150000
+	if (record->record)
+#else
+	if (record->decoded_record)
+#endif
+		xlog_outdesc(&buf, record);

 	/* translator: %s is a WAL record description */
 	errcontext("WAL redo at %X/%X for %s",
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -5,7 +5,7 @@ mod link;
 pub use link::LinkAuthError;
 use tokio_postgres::config::AuthKeys;

-use crate::proxy::{handle_try_wake, retry_after};
+use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
 use crate::{
    auth::{self, ClientCredentials},
    config::AuthenticationConfig,
@@ -134,13 +134,14 @@ async fn auth_quirks_creds(
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
    if creds.project.is_none() {
        // Password will be checked by the compute node later.
-        return hacks::password_hack(creds, client).await;
+        return hacks::password_hack(creds, client, latency_timer).await;
    }

    // Password hack should set the project name.
@@ -151,11 +152,11 @@ async fn auth_quirks_creds(
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(client).await;
+        return hacks::cleartext_hack(client, latency_timer).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(api, extra, creds, client, config).await
+    classic::authenticate(api, extra, creds, client, config, latency_timer).await
 }

 /// True to its name, this function encapsulates our current auth trade-offs.
@@ -167,8 +168,18 @@ async fn auth_quirks(
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    let auth_stuff = auth_quirks_creds(api, extra, creds, client, allow_cleartext, config).await?;
+    let auth_stuff = auth_quirks_creds(
+        api,
+        extra,
+        creds,
+        client,
+        allow_cleartext,
+        config,
+        latency_timer,
+    )
+    .await?;

    let mut num_retries = 0;
    let mut node = loop {
@@ -233,6 +244,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
+        latency_timer: &mut LatencyTimer,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

@@ -245,7 +257,16 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
+                auth_quirks(
+                    api,
+                    extra,
+                    creds,
+                    client,
+                    allow_cleartext,
+                    config,
+                    latency_timer,
+                )
+                .await?
            }
            Postgres(api, creds) => {
                info!(
@@ -255,7 +276,16 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
+                auth_quirks(
+                    api,
+                    extra,
+                    creds,
+                    client,
+                    allow_cleartext,
+                    config,
+                    latency_timer,
+                )
+                .await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,6 +4,7 @@ use crate::{
    compute,
    config::AuthenticationConfig,
    console::{self, AuthInfo, ConsoleReqExtra},
+    proxy::LatencyTimer,
    sasl, scram,
    stream::PqStream,
 };
@@ -16,6 +17,7 @@ pub(super) async fn authenticate(
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    config: &'static AuthenticationConfig,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    info!("fetching user's authentication info");
    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
@@ -36,24 +38,26 @@ pub(super) async fn authenticate(
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);

-            let auth_flow = flow.begin(scram).await.map_err(|error| {
-                warn!(?error, "error sending scram acknowledgement");
-                error
-            })?;
-
            let auth_outcome = tokio::time::timeout(
                config.scram_protocol_timeout,
-                auth_flow.authenticate(),
+                async {
+                    // pause the timer while we communicate with the client
+                    let _paused = latency_timer.pause();
+
+                    flow.begin(scram).await.map_err(|error| {
+                        warn!(?error, "error sending scram acknowledgement");
+                        error
+                    })?.authenticate().await.map_err(|error| {
+                        warn!(?error, "error processing scram messages");
+                        error
+                    })
+                }
            )
            .await
            .map_err(|error| {
                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
-            })?
-            .map_err(|error| {
-                warn!(?error, "error processing scram messages");
-                error
-            })?;
+            })??;

            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,6 +1,7 @@
 use super::{AuthSuccess, ComputeCredentials};
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
+    proxy::LatencyTimer,
    stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -12,8 +13,13 @@ use tracing::{info, warn};
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("cleartext auth flow override is enabled, proceeding");
+
+    // pause the timer while we communicate with the client
+    let _paused = latency_timer.pause();
+
    let password = AuthFlow::new(client)
        .begin(auth::CleartextPassword)
        .await?
@@ -32,8 +38,13 @@ pub async fn cleartext_hack(
 pub async fn password_hack(
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("project not specified, resorting to the password hack auth flow");
+
+    // pause the timer while we communicate with the client
+    let _paused = latency_timer.pause();
+
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
        .await?
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -16,9 +16,10 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
-use utils::{project_git_version, sentry_init::init_sentry};
+use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 use clap::{Parser, ValueEnum};

@@ -100,7 +101,8 @@ async fn main() -> anyhow::Result<()> {
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    info!("Version: {GIT_VERSION}");
-    ::metrics::set_build_info_metric(GIT_VERSION);
+    info!("Build_tag: {BUILD_TAG}");
+    ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -223,7 +223,7 @@ pub struct CacheOptions {

 impl CacheOptions {
    /// Default options for [`crate::console::provider::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";
+    pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m";

    /// Parse cache options passed via cmdline.
    /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -97,8 +97,6 @@ pub mod errors {
                    !text.contains("quota exceeded")
                        && !text.contains("the limit for current plan reached")
                }
-                // retry server errors
-                Self::Console { status, .. } if status.is_server_error() => true,
                _ => false,
            }
        }
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -59,7 +59,7 @@ impl Api {
            let rows = client.query(query, &[&creds.user]).await?;

            // We can get at most one row, because `rolname` is unique.
-            let row = match rows.get(0) {
+            let row = match rows.first() {
                Some(row) => row,
                // This means that the user doesn't exist, so there can be no secret.
                // However, this is still a *valid* outcome which is very similar
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -49,7 +49,7 @@ impl Api {
                .endpoint
                .get("proxy_get_role_secret")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", &self.jwt)
+                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
@@ -94,7 +94,7 @@ impl Api {
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", &self.jwt)
+                .header("Authorization", format!("Bearer {}", &self.jwt))
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -106,17 +106,26 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
 });

 pub struct LatencyTimer {
-    start: Instant,
+    // instant at which the stopwatch was last (re)started; `None` while paused
+    start: Option<Instant>,
+    // accumulated time on the stopwatch
+    accumulated: std::time::Duration,
+    // label data
    protocol: &'static str,
    cache_miss: bool,
    pool_miss: bool,
    outcome: &'static str,
 }

+pub struct LatencyTimerPause<'a> {
+    timer: &'a mut LatencyTimer,
+}
+
 impl LatencyTimer {
    pub fn new(protocol: &'static str) -> Self {
        Self {
-            start: Instant::now(),
+            start: Some(Instant::now()),
+            accumulated: std::time::Duration::ZERO,
            protocol,
            cache_miss: false,
            // by default we don't do pooling
@@ -126,6 +135,13 @@ impl LatencyTimer {
        }
    }

+    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
+        // stop the stopwatch and record the time that we have accumulated
+        let start = self.start.take().expect("latency timer should be started");
+        self.accumulated += start.elapsed();
+        LatencyTimerPause { timer: self }
+    }
+
    pub fn cache_miss(&mut self) {
        self.cache_miss = true;
    }
@@ -139,9 +155,17 @@ impl LatencyTimer {
    }
 }

+impl Drop for LatencyTimerPause<'_> {
+    fn drop(&mut self) {
+        // start the stopwatch again
+        self.timer.start = Some(Instant::now());
+    }
+}
+
 impl Drop for LatencyTimer {
    fn drop(&mut self) {
-        let duration = self.start.elapsed().as_secs_f64();
+        let duration =
+            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
        COMPUTE_CONNECTION_LATENCY
            .with_label_values(&[
                self.protocol,
@@ -149,7 +173,7 @@ impl Drop for LatencyTimer {
                bool_to_str(self.pool_miss),
                self.outcome,
            ])
-            .observe(duration)
+            .observe(duration.as_secs_f64())
    }
 }

@@ -171,7 +195,7 @@ static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

-static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "proxy_io_bytes_per_client",
        "Number of bytes sent/received between client and backend.",
@@ -180,6 +204,15 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
 });

+static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes",
+        "Number of bytes sent/received between all clients and backends.",
+        &["direction"],
+    )
+    .unwrap()
+});
+
 pub async fn task_main(
    config: &'static ProxyConfig,
    listener: tokio::net::TcpListener,
@@ -764,24 +797,28 @@ pub async fn proxy_pass(
        branch_id: aux.branch_id.to_string(),
    });

-    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("tx"));
+    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
+    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
    let mut client = MeasuredStream::new(
        client,
        |_| {},
        |cnt| {
            // Number of bytes we sent to the client (outbound).
            m_sent.inc_by(cnt as u64);
+            m_sent2.inc_by(cnt as u64);
            usage.record_egress(cnt as u64);
        },
    );

-    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&aux.traffic_labels("rx"));
+    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
+    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
    let mut compute = MeasuredStream::new(
        compute,
        |_| {},
        |cnt| {
            // Number of bytes the client sent to the compute node (inbound).
            m_recv.inc_by(cnt as u64);
+            m_recv2.inc_by(cnt as u64);
        },
    );

@@ -849,10 +886,16 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            application_name: params.get("application_name"),
        };

-        let latency_timer = LatencyTimer::new(mode.protocol_label());
+        let mut latency_timer = LatencyTimer::new(mode.protocol_label());

        let auth_result = match creds
-            .authenticate(&extra, &mut stream, mode.allow_cleartext(), config)
+            .authenticate(
+                &extra,
+                &mut stream,
+                mode.allow_cleartext(),
+                config,
+                &mut latency_timer,
+            )
            .await
        {
            Ok(auth_result) => auth_result,
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -409,7 +409,7 @@ impl TestBackend for TestConnectMechanism {
            }
            ConnectAction::WakeRetry => {
                let err = console::errors::ApiError::Console {
-                    status: http::StatusCode::INTERNAL_SERVER_ERROR,
+                    status: http::StatusCode::BAD_REQUEST,
                    text: "TEST".into(),
                };
                assert!(err.could_retry());
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -18,7 +18,6 @@ mod password;
 pub use exchange::Exchange;
 pub use key::ScramKey;
 pub use secret::ServerSecret;
-pub use secret::*;

 use hmac::{Hmac, Mac};
 use sha2::{Digest, Sha256};
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -25,57 +25,64 @@ _This section is only relevant if using a command that requires access to Neon's

 ### Commands

-#### `tidy`
+#### `find-garbage`

-Iterate over S3 buckets for storage nodes, checking their contents and removing the data not present in the console. Node S3 data that's not removed is then further checked for discrepancies and, sometimes, validated.
-
-Unless the global `--delete` argument is provided, this command only dry-runs and logs
-what it would have deleted.
-
-```
-tidy --node-kind=<safekeeper|pageserver> [--depth=<tenant|timeline>] [--skip-validation]
-```
+Walk an S3 bucket and cross-reference the contents with the Console API to identify data for
+tenants or timelines that should no longer exist.

 - `--node-kind`: whether to inspect safekeeper or pageserver bucket prefix
 - `--depth`: whether to only search for deletable tenants, or also search for
  deletable timelines within active tenants. Default: `tenant`
- `--skip-validation`: skip additional post-deletion checks. Default: `false`
+- `--output-path`: filename to write garbage list to.  Default: `garbage.json`

-For a selected S3 path, the tool lists the S3 bucket given for either tenants or both tenants and timelines — for every found entry, console API is queried: any deleted or missing in the API entity is scheduled for deletion from S3.
+This command outputs a JSON file describing tenants and timelines to remove, for subsequent
+processing by the `purge-garbage` subcommand.

-If validation is enabled, only the non-deleted tenants' ones are checked.
-For pageserver, timelines' index_part.json on S3 is also checked for various discrepancies: no files are removed, even if there are "extra" S3 files not present in index_part.json: due to the way pageserver updates the remote storage, it's better to do such removals manually, stopping the corresponding tenant first.
+**Note that the garbage list format is not stable.  The output of `find-garbage` is only
+  intended for use by the exact same version of the tool running `purge-garbage`.**

-Command examples:
+Example:

-`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=safekeeper`
+`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`

-`env SSO_ACCOUNT_ID=369495373322 REGION=us-east-2 BUCKET=neon-staging-storage-us-east-2 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=pageserver --depth=timeline`
+#### `purge-garbage`

-When dry run stats look satisfying, use `-- --delete` before the `tidy` command to
-disable dry run and run the binary with deletion enabled.
+Consume a garbage list from `find-garbage`, and delete the related objects in the S3 bucket.

-See these lines (and lines around) in the logs for the final stats:
+- `--input-path`: filename to read garbage list from.  Default: `garbage.json`.
+- `--mode`: controls whether to purge only garbage that was specifically marked
+            deleted in the control plane (`deletedonly`), or also to purge tenants/timelines
+            that were not present in the control plane at all (`deletedandmissing`).

- `Finished listing the bucket for tenants`
- `Finished active tenant and timeline validation`
- `Total tenant deletion stats`
- `Total timeline deletion stats`
+This command learns region/bucket details from the garbage file, so it is not necessary
+to pass them on the command line.

-## Current implementation details
+Example:

- The tool does not have any peristent state currently: instead, it creates very verbose logs, with every S3 delete request logged, every tenant/timeline id check, etc.
-  Worse, any panic or early errored tasks might force the tool to exit without printing the final summary — all affected ids will still be in the logs though. The tool has retries inside it, so it's error-resistant up to some extent, and recent runs showed no traces of errors/panics.
+`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`

- Instead of checking non-deleted tenants' timelines instantly, the tool attempts to create separate tasks (futures) for that,
-  complicating the logic and slowing down the process, this should be fixed and done in one "task".
+Add the `--delete` argument before `purge-garbage` to enable deletion.  This is intentionally
+not provided inline in the example above to avoid accidents.  Without the `--delete` flag
+the purge command will log all the keys that it would have deleted.

- The tool does uses only publicly available remote resources (S3, console) and does not access pageserver/safekeeper nodes themselves.
-  Yet, its S3 set up should be prepared for running on any pageserver/safekeeper node, using node's S3 credentials, so the node API access logic could be implemented relatively simply on top.
+#### `scan-metadata`

-## Cleanup procedure:
+Walk objects in a pageserver S3 bucket, and report statistics on the contents.

-### Pageserver preparations
+```
+env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata
+
+Timelines: 31106
+With errors: 3
+With warnings: 13942
+With garbage: 0
+Index versions: 2: 13942, 4: 17162
+Timeline size bytes: min 22413312, 1% 52133887, 10% 56459263, 50% 101711871, 90% 191561727, 99% 280887295, max 167535558656
+Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 201457663, max 275324928
+Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053
+```
+
+## Cleaning up running pageservers

 If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers.

--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -1,178 +1,27 @@
-use std::collections::{hash_map, HashMap, HashSet};
-use std::sync::Arc;
-use std::time::Duration;
+use std::collections::HashSet;

 use anyhow::Context;
 use aws_sdk_s3::Client;
-use tokio::task::JoinSet;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};

-use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId};
-use crate::delete_batch_producer::DeleteProducerStats;
-use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES};
+use crate::cloud_admin_api::BranchData;
+use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::IndexPart;
 use utils::id::TenantTimelineId;

-pub async fn validate_pageserver_active_tenant_and_timelines(
-    s3_client: Arc<Client>,
-    s3_root: RootTarget,
-    admin_client: Arc<CloudAdminApiClient>,
-    batch_producer_stats: DeleteProducerStats,
-) -> anyhow::Result<BranchCheckStats> {
-    let Some(timeline_stats) = batch_producer_stats.timeline_stats else {
-        info!("No tenant-only checks, exiting");
-        return Ok(BranchCheckStats::default());
-    };
-
-    let s3_active_projects = batch_producer_stats
-        .tenant_stats
-        .active_entries
-        .into_iter()
-        .map(|project| (project.id.clone(), project))
-        .collect::<HashMap<_, _>>();
-    info!("Validating {} active tenants", s3_active_projects.len());
-
-    let mut s3_active_branches_per_project = HashMap::<ProjectId, Vec<BranchData>>::new();
-    let mut s3_blob_data = HashMap::<TenantTimelineId, S3TimelineBlobData>::new();
-    for active_branch in timeline_stats.active_entries {
-        let active_project_id = active_branch.project_id.clone();
-        let active_branch_id = active_branch.id.clone();
-        let active_timeline_id = active_branch.timeline_id;
-
-        s3_active_branches_per_project
-            .entry(active_project_id.clone())
-            .or_default()
-            .push(active_branch);
-
-        let Some(active_project) = s3_active_projects.get(&active_project_id) else {
-            error!(
-                "Branch {:?} for project {:?} has no such project in the active projects",
-                active_branch_id, active_project_id
-            );
-            continue;
-        };
-
-        let id = TenantTimelineId::new(active_project.tenant, active_timeline_id);
-        s3_blob_data.insert(
-            id,
-            list_timeline_blobs(&s3_client, id, &s3_root)
-                .await
-                .with_context(|| format!("List timeline {id} blobs"))?,
-        );
-    }
-
-    let mut branch_checks = JoinSet::new();
-    for (_, s3_active_project) in s3_active_projects {
-        let project_id = &s3_active_project.id;
-        let tenant_id = s3_active_project.tenant;
-
-        let mut console_active_branches =
-            branches_for_project_with_retries(&admin_client, project_id)
-                .await
-                .with_context(|| {
-                    format!("Client API branches for project {project_id:?} retrieval")
-                })?
-                .into_iter()
-                .map(|branch| (branch.id.clone(), branch))
-                .collect::<HashMap<_, _>>();
-
-        let active_branches = s3_active_branches_per_project
-            .remove(project_id)
-            .unwrap_or_default();
-        info!(
-            "Spawning tasks for {} tenant {} active timelines",
-            active_branches.len(),
-            tenant_id
-        );
-        for s3_active_branch in active_branches {
-            let console_branch = console_active_branches.remove(&s3_active_branch.id);
-            let timeline_id = s3_active_branch.timeline_id;
-            let id = TenantTimelineId::new(tenant_id, timeline_id);
-            let s3_data = s3_blob_data.remove(&id);
-            let s3_root = s3_root.clone();
-            branch_checks.spawn(
-                async move {
-                    let check_errors = branch_cleanup_and_check_errors(
-                        &id,
-                        &s3_root,
-                        Some(&s3_active_branch),
-                        console_branch,
-                        s3_data,
-                    )
-                    .await;
-                    (id, check_errors)
-                }
-                .instrument(info_span!("check_timeline", id = %id)),
-            );
-        }
-    }
-
-    let mut total_stats = BranchCheckStats::default();
-    while let Some((id, analysis)) = branch_checks
-        .join_next()
-        .await
-        .transpose()
-        .context("branch check task join")?
-    {
-        total_stats.add(id, analysis.errors);
-    }
-    Ok(total_stats)
-}
-
-async fn branches_for_project_with_retries(
-    admin_client: &CloudAdminApiClient,
-    project_id: &ProjectId,
-) -> anyhow::Result<Vec<BranchData>> {
-    for _ in 0..MAX_RETRIES {
-        match admin_client.branches_for_project(project_id, false).await {
-            Ok(branches) => return Ok(branches),
-            Err(e) => {
-                error!("admin list branches for project {project_id:?} query failed: {e}");
-                tokio::time::sleep(Duration::from_secs(1)).await;
-            }
-        }
-    }
-
-    anyhow::bail!("Failed to list branches for project {project_id:?} {MAX_RETRIES} times")
-}
-
-#[derive(Debug, Default)]
-pub struct BranchCheckStats {
-    pub timelines_with_errors: HashMap<TenantTimelineId, Vec<String>>,
-    pub normal_timelines: HashSet<TenantTimelineId>,
-}
-
-impl BranchCheckStats {
-    pub fn add(&mut self, id: TenantTimelineId, check_errors: Vec<String>) {
-        if check_errors.is_empty() {
-            if !self.normal_timelines.insert(id) {
-                panic!("Checking branch with timeline {id} more than once")
-            }
-        } else {
-            match self.timelines_with_errors.entry(id) {
-                hash_map::Entry::Occupied(_) => {
-                    panic!("Checking branch with timeline {id} more than once")
-                }
-                hash_map::Entry::Vacant(v) => {
-                    v.insert(check_errors);
-                }
-            }
-        }
-    }
-}
-
-pub struct TimelineAnalysis {
+pub(crate) struct TimelineAnalysis {
    /// Anomalies detected
-    pub errors: Vec<String>,
+    pub(crate) errors: Vec<String>,

    /// Healthy-but-noteworthy, like old-versioned structures that are readable but
    /// worth reporting for awareness that we must not remove that old version decoding
    /// yet.
-    pub warnings: Vec<String>,
+    pub(crate) warnings: Vec<String>,

-    /// Keys not referenced in metadata: candidates for removal
-    pub garbage_keys: Vec<String>,
+    /// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
+    /// of races between reading the metadata and reading the objects.
+    pub(crate) garbage_keys: Vec<String>,
 }

 impl TimelineAnalysis {
@@ -185,7 +34,7 @@ impl TimelineAnalysis {
    }
 }

-pub async fn branch_cleanup_and_check_errors(
+pub(crate) async fn branch_cleanup_and_check_errors(
    id: &TenantTimelineId,
    s3_root: &RootTarget,
    s3_active_branch: Option<&BranchData>,
@@ -320,13 +169,13 @@ pub async fn branch_cleanup_and_check_errors(
 }

 #[derive(Debug)]
-pub struct S3TimelineBlobData {
-    pub blob_data: BlobDataParseResult,
-    pub keys_to_remove: Vec<String>,
+pub(crate) struct S3TimelineBlobData {
+    pub(crate) blob_data: BlobDataParseResult,
+    pub(crate) keys_to_remove: Vec<String>,
 }

 #[derive(Debug)]
-pub enum BlobDataParseResult {
+pub(crate) enum BlobDataParseResult {
    Parsed {
        index_part: IndexPart,
        s3_layers: HashSet<LayerFileName>,
@@ -334,7 +183,7 @@ pub enum BlobDataParseResult {
    Incorrect(Vec<String>),
 }

-pub async fn list_timeline_blobs(
+pub(crate) async fn list_timeline_blobs(
    s3_client: &Client,
    id: TenantTimelineId,
    s3_root: &RootTarget,
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,12 +1,19 @@
 #![allow(unused)]

+use std::str::FromStr;
+use std::time::Duration;
+
 use chrono::{DateTime, Utc};
-use reqwest::{header, Client, Url};
+use hex::FromHex;
+use reqwest::{header, Client, StatusCode, Url};
+use serde::Deserialize;
 use tokio::sync::Semaphore;

 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use crate::ConsoleConfig;
+
 #[derive(Debug)]
 pub struct Error {
    context: String,
@@ -34,6 +41,9 @@ impl std::fmt::Display for Error {
                    self.context, e
                )
            }
+            ErrorKind::ResponseStatus(status) => {
+                write!(f, "Bad response status {}: {}", status, self.context)
+            }
            ErrorKind::UnexpectedState => write!(f, "Unexpected state: {}", self.context),
        }
    }
@@ -53,6 +63,7 @@ impl std::error::Error for Error {}
 pub enum ErrorKind {
    RequestSend(reqwest::Error),
    BodyRead(reqwest::Error),
+    ResponseStatus(StatusCode),
    UnexpectedState,
 }

@@ -100,7 +111,23 @@ pub struct SafekeeperData {
    pub availability_zone_id: String,
 }

-#[serde_with::serde_as]
+/// Serde `deserialize_with` helper for tenant IDs coming from the Console API, which
+/// sometimes sends an empty string instead of an ID.  `TenantId` itself rejects that
+/// (rightly), so an empty string is mapped to an all-zero placeholder ID here.
+fn from_nullable_id<'de, D>(deserializer: D) -> Result<TenantId, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let raw = String::deserialize(deserializer)?;
+    if !raw.is_empty() {
+        // Any non-empty value must parse as a hex ID; a parse failure is reported
+        // through serde carrying the parser's own message.
+        return TenantId::from_hex(&raw).map_err(serde::de::Error::custom);
+    }
+    // Placeholder only: the scrubber just needs it not to collide with a real ID.
+    Ok(TenantId::from([0u8; 16]))
+}
+
 #[derive(Debug, Clone, serde::Deserialize)]
 pub struct ProjectData {
    pub id: ProjectId,
@@ -109,7 +136,7 @@ pub struct ProjectData {
    pub platform_id: String,
    pub user_id: String,
    pub pageserver_id: u64,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
+    #[serde(deserialize_with = "from_nullable_id")]
    pub tenant: TenantId,
    pub safekeepers: Vec<SafekeeperData>,
    pub deleted: bool,
@@ -148,11 +175,27 @@ pub struct BranchData {
    pub written_size: Option<u64>,
 }

+pub trait MaybeDeleted {
+    fn is_deleted(&self) -> bool; // both impls in this file return the type's `deleted` field
+}
+
+impl MaybeDeleted for ProjectData {
+    fn is_deleted(&self) -> bool {
+        self.deleted // plain read of the project's `deleted` field
+    }
+}
+
+impl MaybeDeleted for BranchData {
+    fn is_deleted(&self) -> bool {
+        self.deleted // plain read of the branch's `deleted` field
+    }
+}
+
 impl CloudAdminApiClient {
-    pub fn new(token: String, base_url: Url) -> Self {
+    pub fn new(config: ConsoleConfig) -> Self {
        Self {
-            token,
-            base_url,
+            token: config.token,
+            base_url: config.base_url,
            request_limiter: Semaphore::new(200),
            http_client: Client::new(), // TODO timeout configs at least
        }
@@ -208,6 +251,81 @@ impl CloudAdminApiClient {
        }
    }

+    /// Lists the non-deleted projects that belong to `region_id`.
+    ///
+    /// Pages through `/projects` with `show_deleted=false`, `PAGINATION_LIMIT` entries at a
+    /// time, keeping only entries whose `region_id` matches.  Paging stops once the offset
+    /// reaches the reported `total` (a missing `total` counts as 0) or a page comes back empty.
+    ///
+    /// # Errors
+    ///
+    /// A 503 or 429 response is retried after a 500ms pause, without limit.  A transport
+    /// failure, any other non-OK status, or an undecodable body is returned as an `Error`.
+    pub async fn list_projects(&self, region_id: String) -> Result<Vec<ProjectData>, Error> {
+        let _permit = self
+            .request_limiter
+            .acquire()
+            .await
+            .expect("Semaphore is not closed");
+
+        // Transport, status and body-read failures all share this context string.
+        let err = |kind: ErrorKind| Error::new("List active projects".to_string(), kind);
+
+        let mut pagination_offset = 0;
+        const PAGINATION_LIMIT: usize = 512;
+        let mut result: Vec<ProjectData> = Vec::with_capacity(PAGINATION_LIMIT);
+        loop {
+            let response = self
+                .http_client
+                .get(self.append_url("/projects"))
+                .query(&[
+                    ("show_deleted", "false".to_string()),
+                    ("limit", format!("{PAGINATION_LIMIT}")),
+                    ("offset", format!("{pagination_offset}")),
+                ])
+                .header(header::ACCEPT, "application/json")
+                .bearer_auth(&self.token)
+                .send()
+                .await
+                .map_err(|e| err(ErrorKind::RequestSend(e)))?;
+
+            match response.status() {
+                StatusCode::OK => {}
+                StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => {
+                    // Retryable status: pause briefly, then re-request the same page.
+                    tokio::time::sleep(Duration::from_millis(500)).await;
+                    continue;
+                }
+                status => return Err(err(ErrorKind::ResponseStatus(status))),
+            }
+
+            let response_bytes = response.bytes().await.map_err(|e| err(ErrorKind::BodyRead(e)))?;
+
+            let decode_result =
+                serde_json::from_slice::<AdminApiResponse<Vec<ProjectData>>>(&response_bytes);
+            let mut response = decode_result.map_err(|decode| {
+                // Log the body (lossily: it may not be valid UTF-8) and fail the call
+                // instead of panicking, so the caller decides how to handle bad data.
+                let body = String::from_utf8_lossy(&response_bytes);
+                tracing::error!("Failed to decode response body: {}\n{}", decode, body);
+                let context = format!("List active projects: undecodable response: {decode}");
+                Error::new(context, ErrorKind::UnexpectedState)
+            })?;
+
+            let page_len = response.data.len();
+            pagination_offset += page_len;
+            result.extend(response.data.drain(..).filter(|t| t.region_id == region_id));
+
+            // An empty page can never advance the offset, so stop rather than spin forever
+            // if the reported total is larger than what the API actually returns.
+            if page_len == 0 || pagination_offset >= response.total.unwrap_or(0) {
+                break;
+            }
+        }
+
+        Ok(result)
+    }
+
    pub async fn find_timeline_branch(
        &self,
        timeline_id: TimelineId,
--- a/s3_scrubber/src/delete_batch_producer.rs
+++ b/s3_scrubber/src/delete_batch_producer.rs
@@ -1,354 +0,0 @@
-mod tenant_batch;
-mod timeline_batch;
-
-use std::future::Future;
-use std::str::FromStr;
-use std::sync::Arc;
-use std::time::Duration;
-
-use anyhow::Context;
-use aws_sdk_s3::Client;
-use either::Either;
-use tokio::sync::mpsc::UnboundedReceiver;
-use tokio::sync::Mutex;
-use tokio::task::{JoinHandle, JoinSet};
-use tracing::{error, info, info_span, Instrument};
-
-use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
-use crate::{list_objects_with_retries, RootTarget, S3Target, TraversingDepth, MAX_RETRIES};
-use utils::id::{TenantId, TenantTimelineId};
-
-/// Typical tenant to remove contains 1 layer and 1 index_part.json blobs
-/// Also, there are some non-standard tenants to remove, having more layers.
-/// delete_objects request allows up to 1000 keys, so be on a safe side and allow most
-/// batch processing tasks to do 1 delete objects request only.
-///
-/// Every batch item will be additionally S3 LS'ed later, so keep the batch size
-/// even lower to allow multiple concurrent tasks do the LS requests.
-const BATCH_SIZE: usize = 100;
-
-pub struct DeleteBatchProducer {
-    delete_tenants_sender_task: JoinHandle<anyhow::Result<ProcessedS3List<TenantId, ProjectData>>>,
-    delete_timelines_sender_task:
-        JoinHandle<anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>>>,
-    delete_batch_creator_task: JoinHandle<()>,
-    delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
-}
-
-pub struct DeleteProducerStats {
-    pub tenant_stats: ProcessedS3List<TenantId, ProjectData>,
-    pub timeline_stats: Option<ProcessedS3List<TenantTimelineId, BranchData>>,
-}
-
-impl DeleteProducerStats {
-    pub fn tenants_checked(&self) -> usize {
-        self.tenant_stats.entries_total
-    }
-
-    pub fn active_tenants(&self) -> usize {
-        self.tenant_stats.active_entries.len()
-    }
-
-    pub fn timelines_checked(&self) -> usize {
-        self.timeline_stats
-            .as_ref()
-            .map(|stats| stats.entries_total)
-            .unwrap_or(0)
-    }
-}
-
-#[derive(Debug, Default, Clone)]
-pub struct DeleteBatch {
-    pub tenants: Vec<TenantId>,
-    pub timelines: Vec<TenantTimelineId>,
-}
-
-impl DeleteBatch {
-    pub fn merge(&mut self, other: Self) {
-        self.tenants.extend(other.tenants);
-        self.timelines.extend(other.timelines);
-    }
-
-    pub fn len(&self) -> usize {
-        self.tenants.len() + self.timelines.len()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-}
-
-impl DeleteBatchProducer {
-    pub fn start(
-        admin_client: Arc<CloudAdminApiClient>,
-        s3_client: Arc<Client>,
-        s3_root_target: RootTarget,
-        traversing_depth: TraversingDepth,
-    ) -> Self {
-        let (delete_elements_sender, mut delete_elements_receiver) =
-            tokio::sync::mpsc::unbounded_channel();
-        let delete_elements_sender = Arc::new(delete_elements_sender);
-        let admin_client = Arc::new(admin_client);
-
-        let (projects_to_check_sender, mut projects_to_check_receiver) =
-            tokio::sync::mpsc::unbounded_channel();
-        let delete_tenants_root_target = s3_root_target.clone();
-        let delete_tenants_client = Arc::clone(&s3_client);
-        let delete_tenants_admin_client = Arc::clone(&admin_client);
-        let delete_sender = Arc::clone(&delete_elements_sender);
-        let delete_tenants_sender_task = tokio::spawn(
-            async move {
-                tenant_batch::schedule_cleanup_deleted_tenants(
-                    &delete_tenants_root_target,
-                    &delete_tenants_client,
-                    &delete_tenants_admin_client,
-                    projects_to_check_sender,
-                    delete_sender,
-                    traversing_depth,
-                )
-                .await
-            }
-            .instrument(info_span!("delete_tenants_sender")),
-        );
-        let delete_timelines_sender_task = tokio::spawn(async move {
-            timeline_batch::schedule_cleanup_deleted_timelines(
-                &s3_root_target,
-                &s3_client,
-                &admin_client,
-                &mut projects_to_check_receiver,
-                delete_elements_sender,
-            )
-            .in_current_span()
-            .await
-        });
-
-        let (delete_batch_sender, delete_batch_receiver) = tokio::sync::mpsc::unbounded_channel();
-        let delete_batch_creator_task = tokio::spawn(
-            async move {
-                'outer: loop {
-                    let mut delete_batch = DeleteBatch::default();
-                    while delete_batch.len() < BATCH_SIZE {
-                        match delete_elements_receiver.recv().await {
-                            Some(new_task) => match new_task {
-                                Either::Left(tenant_id) => delete_batch.tenants.push(tenant_id),
-                                Either::Right(timeline_id) => {
-                                    delete_batch.timelines.push(timeline_id)
-                                }
-                            },
-                            None => {
-                                info!("Task finished: sender dropped");
-                                delete_batch_sender.send(delete_batch).ok();
-                                break 'outer;
-                            }
-                        }
-                    }
-
-                    if !delete_batch.is_empty() {
-                        delete_batch_sender.send(delete_batch).ok();
-                    }
-                }
-            }
-            .instrument(info_span!("delete batch creator")),
-        );
-
-        Self {
-            delete_tenants_sender_task,
-            delete_timelines_sender_task,
-            delete_batch_creator_task,
-            delete_batch_receiver: Arc::new(Mutex::new(delete_batch_receiver)),
-        }
-    }
-
-    pub fn subscribe(&self) -> Arc<Mutex<UnboundedReceiver<DeleteBatch>>> {
-        self.delete_batch_receiver.clone()
-    }
-
-    pub async fn join(self) -> anyhow::Result<DeleteProducerStats> {
-        let (delete_tenants_task_result, delete_timelines_task_result, batch_task_result) = tokio::join!(
-            self.delete_tenants_sender_task,
-            self.delete_timelines_sender_task,
-            self.delete_batch_creator_task,
-        );
-
-        let tenant_stats = match delete_tenants_task_result {
-            Ok(Ok(stats)) => stats,
-            Ok(Err(tenant_deletion_error)) => return Err(tenant_deletion_error),
-            Err(join_error) => {
-                anyhow::bail!("Failed to join the delete tenant producing task: {join_error}")
-            }
-        };
-
-        let timeline_stats = match delete_timelines_task_result {
-            Ok(Ok(stats)) => Some(stats),
-            Ok(Err(timeline_deletion_error)) => return Err(timeline_deletion_error),
-            Err(join_error) => {
-                anyhow::bail!("Failed to join the delete timeline producing task: {join_error}")
-            }
-        };
-
-        match batch_task_result {
-            Ok(()) => (),
-            Err(join_error) => anyhow::bail!("Failed to join the batch forming task: {join_error}"),
-        };
-
-        Ok(DeleteProducerStats {
-            tenant_stats,
-            timeline_stats,
-        })
-    }
-}
-
-pub struct ProcessedS3List<I, A> {
-    pub entries_total: usize,
-    pub entries_to_delete: Vec<I>,
-    pub active_entries: Vec<A>,
-}
-
-impl<I, A> Default for ProcessedS3List<I, A> {
-    fn default() -> Self {
-        Self {
-            entries_total: 0,
-            entries_to_delete: Vec::new(),
-            active_entries: Vec::new(),
-        }
-    }
-}
-
-impl<I, A> ProcessedS3List<I, A> {
-    fn merge(&mut self, other: Self) {
-        self.entries_total += other.entries_total;
-        self.entries_to_delete.extend(other.entries_to_delete);
-        self.active_entries.extend(other.active_entries);
-    }
-
-    fn change_ids<NewI>(self, transform: impl Fn(I) -> NewI) -> ProcessedS3List<NewI, A> {
-        ProcessedS3List {
-            entries_total: self.entries_total,
-            entries_to_delete: self.entries_to_delete.into_iter().map(transform).collect(),
-            active_entries: self.active_entries,
-        }
-    }
-}
-
-async fn process_s3_target_recursively<F, Fut, I, E, A>(
-    s3_client: &Client,
-    target: &S3Target,
-    find_active_and_deleted_entries: F,
-) -> anyhow::Result<ProcessedS3List<I, A>>
-where
-    I: FromStr<Err = E> + Send + Sync,
-    E: Send + Sync + std::error::Error + 'static,
-    F: FnOnce(Vec<I>) -> Fut + Clone,
-    Fut: Future<Output = anyhow::Result<ProcessedS3List<I, A>>>,
-{
-    let mut continuation_token = None;
-    let mut total_entries = ProcessedS3List::default();
-
-    loop {
-        let fetch_response =
-            list_objects_with_retries(s3_client, target, continuation_token.clone()).await?;
-
-        let new_entry_ids = fetch_response
-            .common_prefixes()
-            .unwrap_or_default()
-            .iter()
-            .filter_map(|prefix| prefix.prefix())
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .strip_prefix(&target.prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                entry_id_str
-                    .parse()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            })
-            .collect::<anyhow::Result<Vec<I>>>()
-            .context("list and parse bucket's entry ids")?;
-
-        total_entries.merge(
-            (find_active_and_deleted_entries.clone())(new_entry_ids)
-                .await
-                .context("filter active and deleted entry ids")?,
-        );
-
-        match fetch_response.next_continuation_token {
-            Some(new_token) => continuation_token = Some(new_token),
-            None => break,
-        }
-    }
-
-    Ok(total_entries)
-}
-
-enum FetchResult<A> {
-    Found(A),
-    Deleted,
-    Absent,
-}
-
-async fn split_to_active_and_deleted_entries<I, A, F, Fut>(
-    new_entry_ids: Vec<I>,
-    find_active_entry: F,
-) -> anyhow::Result<ProcessedS3List<I, A>>
-where
-    I: std::fmt::Display + Send + Sync + 'static + Copy,
-    A: Send + 'static,
-    F: FnOnce(I) -> Fut + Send + Sync + 'static + Clone,
-    Fut: Future<Output = anyhow::Result<FetchResult<A>>> + Send,
-{
-    let entries_total = new_entry_ids.len();
-    let mut check_tasks = JoinSet::new();
-    let mut active_entries = Vec::with_capacity(entries_total);
-    let mut entries_to_delete = Vec::with_capacity(entries_total);
-
-    for new_entry_id in new_entry_ids {
-        let check_closure = find_active_entry.clone();
-        check_tasks.spawn(
-            async move {
-                (
-                    new_entry_id,
-                    async {
-                        for _ in 0..MAX_RETRIES {
-                            let closure_clone = check_closure.clone();
-                            match closure_clone(new_entry_id).await {
-                                Ok(active_entry) => return Ok(active_entry),
-                                Err(e) => {
-                                    error!("find active entry admin API call failed: {e}");
-                                    tokio::time::sleep(Duration::from_secs(1)).await;
-                                }
-                            }
-                        }
-
-                        anyhow::bail!("Failed to check entry {new_entry_id} {MAX_RETRIES} times")
-                    }
-                    .await,
-                )
-            }
-            .instrument(info_span!("filter_active_entries")),
-        );
-    }
-
-    while let Some(task_result) = check_tasks.join_next().await {
-        let (entry_id, entry_data_fetch_result) = task_result.context("task join")?;
-        match entry_data_fetch_result.context("entry data fetch")? {
-            FetchResult::Found(active_entry) => {
-                info!("Entry {entry_id} is alive, cannot delete");
-                active_entries.push(active_entry);
-            }
-            FetchResult::Deleted => {
-                info!("Entry {entry_id} deleted in the admin data, can safely delete");
-                entries_to_delete.push(entry_id);
-            }
-            FetchResult::Absent => {
-                info!("Entry {entry_id} absent in the admin data, can safely delete");
-                entries_to_delete.push(entry_id);
-            }
-        }
-    }
-    Ok(ProcessedS3List {
-        entries_total,
-        entries_to_delete,
-        active_entries,
-    })
-}
--- a/s3_scrubber/src/delete_batch_producer/tenant_batch.rs
+++ b/s3_scrubber/src/delete_batch_producer/tenant_batch.rs
@@ -1,87 +0,0 @@
-use std::sync::Arc;
-
-use anyhow::Context;
-use aws_sdk_s3::Client;
-use either::Either;
-use tokio::sync::mpsc::UnboundedSender;
-use tracing::info;
-
-use crate::cloud_admin_api::{CloudAdminApiClient, ProjectData};
-use crate::delete_batch_producer::FetchResult;
-use crate::{RootTarget, TraversingDepth};
-use utils::id::{TenantId, TenantTimelineId};
-
-use super::ProcessedS3List;
-
-pub async fn schedule_cleanup_deleted_tenants(
-    s3_root_target: &RootTarget,
-    s3_client: &Arc<Client>,
-    admin_client: &Arc<CloudAdminApiClient>,
-    projects_to_check_sender: UnboundedSender<ProjectData>,
-    delete_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
-    traversing_depth: TraversingDepth,
-) -> anyhow::Result<ProcessedS3List<TenantId, ProjectData>> {
-    info!(
-        "Starting to list the bucket from root {}",
-        s3_root_target.bucket_name()
-    );
-    s3_client
-        .head_bucket()
-        .bucket(s3_root_target.bucket_name())
-        .send()
-        .await
-        .with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
-
-    let check_client = Arc::clone(admin_client);
-    let tenant_stats = super::process_s3_target_recursively(
-        s3_client,
-        s3_root_target.tenants_root(),
-        |s3_tenants| async move {
-            let another_client = Arc::clone(&check_client);
-            super::split_to_active_and_deleted_entries(s3_tenants, move |tenant_id| async move {
-                let project_data = another_client
-                    .find_tenant_project(tenant_id)
-                    .await
-                    .with_context(|| format!("Tenant {tenant_id} project admin check"))?;
-
-                Ok(if let Some(console_project) = project_data {
-                    if console_project.deleted {
-                        delete_sender.send(Either::Left(tenant_id)).ok();
-                        FetchResult::Deleted
-                    } else {
-                        if traversing_depth == TraversingDepth::Timeline {
-                            projects_to_check_sender.send(console_project.clone()).ok();
-                        }
-                        FetchResult::Found(console_project)
-                    }
-                } else {
-                    delete_sender.send(Either::Left(tenant_id)).ok();
-                    FetchResult::Absent
-                })
-            })
-            .await
-        },
-    )
-    .await
-    .context("tenant batch processing")?;
-
-    info!(
-        "Among {} tenants, found {} tenants to delete and {} active ones",
-        tenant_stats.entries_total,
-        tenant_stats.entries_to_delete.len(),
-        tenant_stats.active_entries.len(),
-    );
-
-    let tenant_stats = match traversing_depth {
-        TraversingDepth::Tenant => {
-            info!("Finished listing the bucket for tenants only");
-            tenant_stats
-        }
-        TraversingDepth::Timeline => {
-            info!("Finished listing the bucket for tenants and sent {} active tenants to check for timelines", tenant_stats.active_entries.len());
-            tenant_stats
-        }
-    };
-
-    Ok(tenant_stats)
-}
--- a/s3_scrubber/src/delete_batch_producer/timeline_batch.rs
+++ b/s3_scrubber/src/delete_batch_producer/timeline_batch.rs
@@ -1,102 +0,0 @@
-use std::sync::Arc;
-
-use anyhow::Context;
-use aws_sdk_s3::Client;
-use either::Either;
-use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
-use tracing::{info, info_span, Instrument};
-
-use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
-use crate::delete_batch_producer::{FetchResult, ProcessedS3List};
-use crate::RootTarget;
-use utils::id::{TenantId, TenantTimelineId};
-
-pub async fn schedule_cleanup_deleted_timelines(
-    s3_root_target: &RootTarget,
-    s3_client: &Arc<Client>,
-    admin_client: &Arc<CloudAdminApiClient>,
-    projects_to_check_receiver: &mut UnboundedReceiver<ProjectData>,
-    delete_elements_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
-) -> anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>> {
-    info!(
-        "Starting to list the bucket from root {}",
-        s3_root_target.bucket_name()
-    );
-    s3_client
-        .head_bucket()
-        .bucket(s3_root_target.bucket_name())
-        .send()
-        .await
-        .with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
-
-    let mut timeline_stats = ProcessedS3List::default();
-    while let Some(project_to_check) = projects_to_check_receiver.recv().await {
-        let check_client = Arc::clone(admin_client);
-
-        let check_s3_client = Arc::clone(s3_client);
-
-        let check_delete_sender = Arc::clone(&delete_elements_sender);
-
-        let check_root = s3_root_target.clone();
-
-        let new_stats = async move {
-            let tenant_id_to_check = project_to_check.tenant;
-            let check_target = check_root.timelines_root(&tenant_id_to_check);
-            let stats = super::process_s3_target_recursively(
-                &check_s3_client,
-                &check_target,
-                |s3_timelines| async move {
-                    let another_client = check_client.clone();
-                    super::split_to_active_and_deleted_entries(
-                        s3_timelines,
-                        move |timeline_id| async move {
-                            let console_branch = another_client
-                                .find_timeline_branch(timeline_id)
-                                .await
-                                .map_err(|e| {
-                                    anyhow::anyhow!(
-                                        "Timeline {timeline_id} branch admin check: {e}"
-                                    )
-                                })?;
-
-                            let id = TenantTimelineId::new(tenant_id_to_check, timeline_id);
-                            Ok(match console_branch {
-                                Some(console_branch) => {
-                                    if console_branch.deleted {
-                                        check_delete_sender.send(Either::Right(id)).ok();
-                                        FetchResult::Deleted
-                                    } else {
-                                        FetchResult::Found(console_branch)
-                                    }
-                                }
-                                None => {
-                                    check_delete_sender.send(Either::Right(id)).ok();
-                                    FetchResult::Absent
-                                }
-                            })
-                        },
-                    )
-                    .await
-                },
-            )
-            .await
-            .with_context(|| format!("tenant {tenant_id_to_check} timeline batch processing"))?
-            .change_ids(|timeline_id| TenantTimelineId::new(tenant_id_to_check, timeline_id));
-
-            Ok::<_, anyhow::Error>(stats)
-        }
-        .instrument(info_span!("delete_timelines_sender", tenant = %project_to_check.tenant))
-        .await?;
-
-        timeline_stats.merge(new_stats);
-    }
-
-    info!(
-        "Among {} timelines, found {} timelines to delete and {} active ones",
-        timeline_stats.entries_total,
-        timeline_stats.entries_to_delete.len(),
-        timeline_stats.active_entries.len(),
-    );
-
-    Ok(timeline_stats)
-}
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -0,0 +1,419 @@
+//! Functionality for finding and purging garbage, as in "garbage collection".  Garbage means
+//! S3 objects which are either not referenced by any metadata, or are referenced by a
+//! control plane tenant/timeline in a deleted state.
+
+use std::{collections::HashMap, sync::Arc};
+
+use anyhow::Context;
+use aws_sdk_s3::{
+    types::{Delete, ObjectIdentifier},
+    Client,
+};
+use futures_util::{pin_mut, TryStreamExt};
+use serde::{Deserialize, Serialize};
+use tokio_stream::StreamExt;
+use utils::id::{TenantId, TenantTimelineId};
+
+use crate::{
+    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
+    init_remote,
+    metadata_stream::{stream_listing, stream_tenant_timelines, stream_tenants},
+    BucketConfig, ConsoleConfig, NodeKind, RootTarget, TraversingDepth,
+};
+
+#[derive(Serialize, Deserialize, Debug)]
+enum GarbageReason {
+    DeletedInConsole, // the Console lookup returned an item whose `is_deleted()` is true
+    MissingInConsole, // the Console lookup returned nothing for this entity
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+enum GarbageEntity {
+    Tenant(TenantId),           // a whole tenant, identified by its tenant ID
+    Timeline(TenantTimelineId), // one timeline, identified by tenant + timeline ID
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct GarbageItem {
+    entity: GarbageEntity, // the tenant or timeline that was classed as garbage
+    reason: GarbageReason, // why it was classed as garbage
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct GarbageList {
+    /// Remember what NodeKind we were finding garbage for, so that we can
+    /// purge the list without re-stating it.
+    node_kind: NodeKind,
+
+    /// Embed the identity of the bucket, so that we do not risk executing
+    /// the wrong list against the wrong bucket, and so that the user does not have
+    /// to re-state the bucket details when purging.
+    bucket_config: BucketConfig,
+
+    items: Vec<GarbageItem>,
+
+    /// Advisory information to enable consumers to do a validation that if we
+    /// see garbage, we saw some active tenants too.  This protects against classes of bugs
+    /// in the scrubber that might otherwise generate a "deleted all" result.
+    active_tenant_count: usize,
+}
+
+impl GarbageList {
+    /// Start an empty list for the given node kind and bucket: `items` starts empty and
+    /// `active_tenant_count` starts at zero.
+    fn new(node_kind: NodeKind, bucket_config: BucketConfig) -> Self {
+        Self {
+            node_kind,
+            bucket_config,
+            items: Vec::new(),
+            active_tenant_count: 0,
+        }
+    }
+
+    /// Classify one Console lookup result and record `entity` if it turns out to be garbage.
+    ///
+    /// An entity is garbage when the lookup found nothing (`MissingInConsole`) or found an
+    /// item whose `is_deleted()` is true (`DeletedInConsole`).
+    ///
+    /// Returns true if `entity` was appended to the list, false if it was not garbage.
+    fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
+    where
+        T: MaybeDeleted,
+    {
+        let reason = match result {
+            // Present and not deleted: still in use, so there is nothing to record.
+            Some(found) if !found.is_deleted() => return false,
+            // Anything else is garbage; keep hold of why it was classed that way.
+            Some(_) => GarbageReason::DeletedInConsole,
+            None => GarbageReason::MissingInConsole,
+        };
+
+        self.items.push(GarbageItem { entity, reason });
+        true
+    }
+}
+
+/// Scan the bucket for garbage and write the result, as pretty-printed JSON, to `output_path`.
+pub async fn find_garbage(
+    bucket_config: BucketConfig,
+    console_config: ConsoleConfig,
+    depth: TraversingDepth,
+    node_kind: NodeKind,
+    output_path: String,
+) -> anyhow::Result<()> {
+    let garbage = find_garbage_inner(bucket_config, console_config, depth, node_kind).await?;
+    let serialized = serde_json::to_vec_pretty(&garbage).context("serializing garbage list")?;
+    tokio::fs::write(&output_path, &serialized)
+        .await
+        .with_context(|| format!("writing garbage report to {output_path}"))?;
+    tracing::info!("Wrote garbage report to {output_path}");
+    Ok(())
+}
+
+// How many concurrent S3 operations to issue (approximately): this is the concurrency
+// for things like listing the timelines within tenant prefixes.
+const S3_CONCURRENCY: usize = 32;
+
+// How many concurrent API requests to make to the console API.
+const CONSOLE_CONCURRENCY: usize = 128;
+
+/// Compare the tenants (and, depending on `depth`, the timelines of active tenants) found in S3
+/// against the Console API, returning a `GarbageList` of those that are deleted or unknown there.
+async fn find_garbage_inner(
+    bucket_config: BucketConfig,
+    console_config: ConsoleConfig,
+    depth: TraversingDepth,
+    node_kind: NodeKind,
+) -> anyhow::Result<GarbageList> {
+    // Construct clients for S3 and for Console API
+    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?;
+    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
+
+    // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
+    // to issue O(N) console API requests.
+    let console_projects: HashMap<TenantId, ProjectData> = cloud_admin_api_client
+        // FIXME: we can't just assume that all console's region ids are aws-<something>.  This hack
+        // will go away when we are talking to Control Plane APIs, which are per-region.
+        .list_projects(format!("aws-{}", bucket_config.region))
+        .await?
+        .into_iter()
+        .map(|t| (t.tenant, t))
+        .collect();
+    tracing::info!("Loaded {} console projects tenant IDs", console_projects.len());
+
+    // Enumerate Tenants in S3, and check if each one exists in Console
+    tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
+    let tenants = stream_tenants(&s3_client, &target);
+    let tenants_checked = tenants.map_ok(|t| {
+        let api_client = cloud_admin_api_client.clone();
+        let console_projects = &console_projects;
+        async move {
+            match console_projects.get(&t) {
+                Some(project_data) => Ok((t, Some(project_data.clone()))),
+                None => api_client
+                    .find_tenant_project(t)
+                    .await
+                    .map_err(|e| anyhow::anyhow!(e))
+                    .map(|r| (t, r)),
+            }
+        }
+    });
+    let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+
+    // Process the results of Tenant checks.  If a Tenant is garbage, it goes into
+    // the `GarbageList`.  Else it goes into `active_tenants` for more detailed timeline
+    // checks if they are enabled by the `depth` parameter.
+    pin_mut!(tenants_checked);
+    let mut garbage = GarbageList::new(node_kind, bucket_config);
+    let mut active_tenants: Vec<TenantId> = vec![];
+    let mut counter = 0;
+    while let Some(result) = tenants_checked.next().await {
+        let (tenant_id, console_result) = result?;
+
+        // Paranoia check
+        if let Some(project) = &console_result {
+            assert!(project.tenant == tenant_id);
+        }
+
+        if garbage.maybe_append(GarbageEntity::Tenant(tenant_id), console_result) {
+            tracing::debug!("Tenant {tenant_id} is garbage");
+        } else {
+            tracing::debug!("Tenant {tenant_id} is active");
+            active_tenants.push(tenant_id);
+        }
+
+        counter += 1;
+        if counter % 1000 == 0 {
+            tracing::info!(
+                "Progress: {counter} tenants checked, {} active, {} garbage",
+                active_tenants.len(),
+                garbage.items.len()
+            );
+        }
+    }
+
+    // Record how many live tenants we saw: `purge_garbage` refuses to act on a list that
+    // reports zero, as a guard against scrubber bugs that would classify everything as garbage.
+    garbage.active_tenant_count = active_tenants.len();
+
+    tracing::info!(
+        "Found {}/{} garbage tenants",
+        garbage.items.len(),
+        garbage.items.len() + active_tenants.len()
+    );
+
+    // If we are only checking tenant-deep, we are done.  Otherwise we must
+    // proceed to check the individual timelines of the active tenants.
+    if depth == TraversingDepth::Tenant {
+        return Ok(garbage);
+    }
+
+    tracing::info!("Checking timelines for {} active tenants", active_tenants.len());
+
+    // Construct a stream of all timelines within active tenants
+    let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
+    let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
+    let timelines = timelines.try_flatten();
+
+    // For all timelines within active tenants, call into console API to check their existence
+    let timelines_checked = timelines.map_ok(|ttid| {
+        let api_client = cloud_admin_api_client.clone();
+        async move {
+            api_client
+                .find_timeline_branch(ttid.timeline_id)
+                .await
+                .map_err(|e| anyhow::anyhow!(e))
+                .map(|r| (ttid, r))
+        }
+    });
+    let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY);
+
+    // Update the GarbageList with any timelines which appear not to exist.
+    pin_mut!(timelines_checked);
+    while let Some(result) = timelines_checked.next().await {
+        let (ttid, console_result) = result?;
+        if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) {
+            tracing::debug!("Timeline {ttid} is garbage");
+        } else {
+            tracing::debug!("Timeline {ttid} is active");
+        }
+    }
+
+    Ok(garbage)
+}
+
+#[derive(clap::ValueEnum, Debug, Clone)]
+pub enum PurgeMode {
+    /// The safest mode: only delete tenants that were explicitly reported as deleted
+    /// by Console API.
+    DeletedOnly,
+
+    /// Delete all garbage tenants, including those which are only presumed to be deleted,
+    /// because the Console API could not find them.
+    DeletedAndMissing,
+}
+
+impl std::fmt::Display for PurgeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            PurgeMode::DeletedOnly => "deleted-only",
+            PurgeMode::DeletedAndMissing => "deleted-and-missing",
+        })
+    }
+}
+
+/// List every object key under the given tenant's prefix in the bucket.
+/// Returns an error if any of the underlying listing requests fails.
+pub async fn get_tenant_objects(
+    s3_client: &Arc<Client>,
+    target: RootTarget,
+    tenant_id: TenantId,
+) -> anyhow::Result<Vec<ObjectIdentifier>> {
+    tracing::debug!("Listing objects in tenant {tenant_id}");
+    // TODO: apply extra validation based on object modification time.  Don't purge
+    // tenants where any timeline's index_part.json has been touched recently.
+
+    // An empty delimiter makes the listing return every key under the prefix, rather than
+    // only common prefixes.
+    let mut listing_root = target.tenant_root(&tenant_id);
+    listing_root.delimiter.clear();
+
+    stream_listing(s3_client, &listing_root).try_collect().await
+}
+
+/// List every object key under the given timeline's prefix in the bucket.
+/// Returns an error if any of the underlying listing requests fails.
+pub async fn get_timeline_objects(
+    s3_client: &Arc<Client>,
+    target: RootTarget,
+    ttid: TenantTimelineId,
+) -> anyhow::Result<Vec<ObjectIdentifier>> {
+    tracing::debug!("Listing objects in timeline {ttid}");
+    // TODO: apply extra validation based on object modification time.  Don't purge
+    // timelines whose index_part.json has been touched recently.
+
+    // An empty delimiter makes the listing return every key under the prefix, rather than
+    // only common prefixes.
+    let mut listing_root = target.timeline_root(&ttid);
+    listing_root.delimiter.clear();
+
+    stream_listing(s3_client, &listing_root).try_collect().await
+}
+
+const MAX_KEYS_PER_DELETE: usize = 1000;
+
+/// Send buffered keys to S3 in DeleteObjects batches of up to MAX_KEYS_PER_DELETE keys: only full
+/// batches unless `drain` is set.  With `dry_run`, keys are logged instead of deleted.
+async fn do_delete(
+    s3_client: &Arc<Client>,
+    bucket_name: &str,
+    keys: &mut Vec<ObjectIdentifier>,
+    dry_run: bool,
+    drain: bool,
+) -> anyhow::Result<()> {
+    while (drain && !keys.is_empty()) || keys.len() >= MAX_KEYS_PER_DELETE {
+        // Take (up to) one request's worth of keys off the tail of the buffer.
+        let batch = keys.split_off(keys.len().saturating_sub(MAX_KEYS_PER_DELETE));
+        if dry_run {
+            tracing::info!("Dry-run deletion of objects: ");
+            batch.iter().for_each(|k| tracing::info!("  {k:?}"));
+            continue;
+        }
+        let delete = Delete::builder().set_objects(Some(batch)).build();
+        let request = s3_client.delete_objects().bucket(bucket_name).delete(delete);
+        let response = request.send().await.context("DeleteObjects request")?;
+
+        // DeleteObjects reports per-key failures inside a successful response: do not ignore them.
+        let errors = response.errors().unwrap_or_default();
+        if let Some(first_error) = errors.first() {
+            anyhow::bail!("DeleteObjects failed for {} keys: {first_error:?}", errors.len());
+        }
+    }
+
+    Ok(())
+}
+
+/// Delete the S3 objects belonging to the items of a garbage list written by `find_garbage`.
+///
+/// * `input_path`: path of the JSON-serialized `GarbageList` to act on.
+/// * `mode`: selects which items are eligible, based on why they were classed as garbage.
+/// * `dry_run`: when set, the object keys are listed and logged but nothing is deleted.
+///
+/// Fails if the list cannot be read or parsed, if it reports zero active tenants, or if any
+/// S3 listing or deletion request fails.
+pub async fn purge_garbage(
+    input_path: String,
+    mode: PurgeMode,
+    dry_run: bool,
+) -> anyhow::Result<()> {
+    let list_bytes = tokio::fs::read(&input_path).await?;
+    let garbage_list = serde_json::from_slice::<GarbageList>(&list_bytes)?;
+    tracing::info!(
+        "Loaded {} items in garbage list from {}",
+        garbage_list.items.len(),
+        input_path
+    );
+
+    let (s3_client, target) =
+        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?;
+
+    // Sanity checks on the incoming list
+    if garbage_list.active_tenant_count == 0 {
+        anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants");
+    }
+
+    // Collect the eligible items up front, so that the log below reports the filtered count.
+    let filtered_items: Vec<&GarbageItem> = garbage_list
+        .items
+        .iter()
+        .filter(|i| match (&mode, &i.reason) {
+            (PurgeMode::DeletedAndMissing, _) => true,
+            (PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
+            (PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
+        })
+        .collect();
+
+    tracing::info!(
+        "Filtered down to {} garbage items based on mode {}",
+        filtered_items.len(),
+        mode
+    );
+
+    // List the objects under each garbage item's prefix, up to S3_CONCURRENCY items at a time.
+    let items = tokio_stream::iter(filtered_items.into_iter().map(Ok));
+    let get_objects_results = items.map_ok(|i| {
+        let s3_client = s3_client.clone();
+        let target = target.clone();
+        async move {
+            match i.entity {
+                GarbageEntity::Tenant(tenant_id) => {
+                    get_tenant_objects(&s3_client, target, tenant_id).await
+                }
+                GarbageEntity::Timeline(ttid) => {
+                    get_timeline_objects(&s3_client, target, ttid).await
+                }
+            }
+        }
+    });
+    let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY);
+
+    pin_mut!(get_objects_results);
+    let mut objects_to_delete = Vec::new();
+    let mut object_count = 0;
+    let bucket = garbage_list.bucket_config.bucket.as_str();
+    while let Some(result) = get_objects_results.next().await {
+        let mut object_list = result?;
+        object_count += object_list.len();
+        objects_to_delete.append(&mut object_list);
+        // Non-draining call: only issues requests once a full batch of keys has accumulated.
+        do_delete(&s3_client, bucket, &mut objects_to_delete, dry_run, false).await?;
+    }
+
+    // Flush whatever is left over, however few keys that is.
+    do_delete(&s3_client, bucket, &mut objects_to_delete, dry_run, true).await?;
+
+    tracing::info!("Purge finished: {object_count} objects processed (dry_run={dry_run})");
+
+    Ok(())
+}
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -1,12 +1,12 @@
 pub mod checks;
 pub mod cloud_admin_api;
-pub mod delete_batch_producer;
+pub mod garbage;
 pub mod metadata_stream;
-mod s3_deletion;
 pub mod scan_metadata;

 use std::env;
 use std::fmt::Display;
+use std::sync::Arc;
 use std::time::Duration;

 use anyhow::Context;
@@ -17,8 +17,10 @@ use aws_config::sso::SsoCredentialsProvider;
 use aws_sdk_s3::config::Region;
 use aws_sdk_s3::{Client, Config};

+use clap::ValueEnum;
+use pageserver::tenant::TENANTS_SEGMENT_NAME;
 use reqwest::Url;
-pub use s3_deletion::S3Deleter;
+use serde::{Deserialize, Serialize};
 use std::io::IsTerminal;
 use tokio::io::AsyncReadExt;
 use tracing::error;
@@ -29,8 +31,6 @@ use utils::id::{TenantId, TenantTimelineId};
 const MAX_RETRIES: usize = 20;
 const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";

-pub const CLI_NAME: &str = "s3-scrubber";
-
 #[derive(Debug, Clone)]
 pub struct S3Target {
    pub bucket_name: String,
@@ -53,6 +53,27 @@ impl Display for TraversingDepth {
    }
 }

+#[derive(ValueEnum, Clone, Copy, Eq, PartialEq, Debug, Serialize, Deserialize)]
+pub enum NodeKind {
+    Safekeeper,
+    Pageserver,
+}
+
+impl NodeKind {
+    fn as_str(&self) -> &'static str {
+        match self {
+            Self::Safekeeper => "safekeeper",
+            Self::Pageserver => "pageserver",
+        }
+    }
+}
+
+impl Display for NodeKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
 impl S3Target {
    pub fn with_sub_segment(&self, new_segment: &str) -> Self {
        let mut new_self = self.clone();
@@ -108,6 +129,7 @@ impl RootTarget {
    }
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct BucketConfig {
    pub region: String,
    pub bucket: String,
@@ -143,31 +165,21 @@ impl BucketConfig {
 }

 pub struct ConsoleConfig {
-    pub admin_api_url: Url,
+    pub token: String,
+    pub base_url: Url,
 }

 impl ConsoleConfig {
    pub fn from_env() -> anyhow::Result<Self> {
-        let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL")
+        let base_url: Url = env::var("CLOUD_ADMIN_API_URL")
            .context("'CLOUD_ADMIN_API_URL' param retrieval")?
            .parse()
            .context("'CLOUD_ADMIN_API_URL' param parsing")?;

-        Ok(Self { admin_api_url })
-    }
-}
+        let token = env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR)
+            .context("'CLOUD_ADMIN_API_TOKEN' environment variable fetch")?;

-pub fn get_cloud_admin_api_token_or_exit() -> String {
-    match env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) {
-        Ok(token) => token,
-        Err(env::VarError::NotPresent) => {
-            error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable is not present");
-            std::process::exit(1);
-        }
-        Err(env::VarError::NotUnicode(not_unicode_string)) => {
-            error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable's value is not a valid unicode string: {not_unicode_string:?}");
-            std::process::exit(1);
-        }
+        Ok(Self { base_url, token })
    }
 }

@@ -231,6 +243,29 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
    Client::from_conf(builder.build())
 }

+/// Build the S3 client and the root listing target for `node_kind` from a bucket config.
+fn init_remote(
+    bucket_config: BucketConfig,
+    node_kind: NodeKind,
+) -> anyhow::Result<(Arc<Client>, RootTarget)> {
+    let region = Region::new(bucket_config.region);
+    let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, region));
+
+    // The trailing empty segment makes the joined prefix end with the delimiter.
+    let delimiter = String::from("/");
+    let (segments, wrap): ([&str; 4], fn(S3Target) -> RootTarget) = match node_kind {
+        NodeKind::Pageserver => (
+            ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""],
+            RootTarget::Pageserver,
+        ),
+        NodeKind::Safekeeper => (["safekeeper", "v1", "wal", ""], RootTarget::Safekeeper),
+    };
+    let prefix_in_bucket = segments.join(&delimiter);
+    let bucket_name = bucket_config.bucket;
+    let s3_root = wrap(S3Target { bucket_name, prefix_in_bucket, delimiter });
+    Ok((s3_client, s3_root))
+}
+
 async fn list_objects_with_retries(
    s3_client: &Client,
    s3_target: &S3Target,
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,20 +1,8 @@
-use std::collections::HashMap;
-use std::fmt::Display;
-use std::num::NonZeroUsize;
-use std::sync::Arc;
-
-use anyhow::Context;
-use aws_sdk_s3::config::Region;
-use s3_scrubber::cloud_admin_api::CloudAdminApiClient;
-use s3_scrubber::delete_batch_producer::DeleteBatchProducer;
+use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use s3_scrubber::scan_metadata::scan_metadata;
-use s3_scrubber::{
-    checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig,
-    ConsoleConfig, RootTarget, S3Deleter, S3Target, TraversingDepth, CLI_NAME,
-};
-use tracing::{info, warn};
+use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};

-use clap::{Parser, Subcommand, ValueEnum};
+use clap::{Parser, Subcommand};

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -27,212 +15,45 @@ struct Cli {
    delete: bool,
 }

-#[derive(ValueEnum, Clone, Copy, Eq, PartialEq)]
-enum NodeKind {
-    Safekeeper,
-    Pageserver,
-}
-
-impl NodeKind {
-    fn as_str(&self) -> &'static str {
-        match self {
-            Self::Safekeeper => "safekeeper",
-            Self::Pageserver => "pageserver",
-        }
-    }
-}
-
-impl Display for NodeKind {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(self.as_str())
-    }
-}
-
-#[derive(Subcommand)]
+#[derive(Subcommand, Debug)]
 enum Command {
-    Tidy {
+    FindGarbage {
        #[arg(short, long)]
        node_kind: NodeKind,
        #[arg(short, long, default_value_t=TraversingDepth::Tenant)]
        depth: TraversingDepth,
-        #[arg(short, long, default_value_t = false)]
-        skip_validation: bool,
+        #[arg(short, long, default_value_t = String::from("garbage.json"))]
+        output_path: String,
+    },
+    PurgeGarbage {
+        #[arg(short, long)]
+        input_path: String,
+        #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
+        mode: PurgeMode,
    },
    ScanMetadata {},
 }

-async fn tidy(
-    cli: &Cli,
-    bucket_config: BucketConfig,
-    console_config: ConsoleConfig,
-    node_kind: NodeKind,
-    depth: TraversingDepth,
-    skip_validation: bool,
-) -> anyhow::Result<()> {
-    let dry_run = !cli.delete;
-    let file_name = if dry_run {
-        format!(
-            "{}_{}_{}__dry.log",
-            CLI_NAME,
-            node_kind,
-            chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
-        )
-    } else {
-        format!(
-            "{}_{}_{}.log",
-            CLI_NAME,
-            node_kind,
-            chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
-        )
-    };
-
-    let _guard = init_logging(&file_name);
-
-    if dry_run {
-        info!("Dry run, not removing items for real");
-    } else {
-        warn!("Dry run disabled, removing bucket items for real");
-    }
-
-    info!("skip_validation={skip_validation}");
-
-    info!("Starting extra S3 removal in {bucket_config} for node kind '{node_kind}', traversing depth: {depth:?}");
-
-    info!("Starting extra tenant S3 removal in {bucket_config} for node kind '{node_kind}'");
-    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(
-        get_cloud_admin_api_token_or_exit(),
-        console_config.admin_api_url,
-    ));
-
-    let bucket_region = Region::new(bucket_config.region);
-    let delimiter = "/".to_string();
-    let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
-    let s3_root = match node_kind {
-        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: ["pageserver", "v1", "tenants", ""].join(&delimiter),
-            delimiter,
-        }),
-        NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
-            bucket_name: bucket_config.bucket,
-            prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
-            delimiter,
-        }),
-    };
-
-    let delete_batch_producer = DeleteBatchProducer::start(
-        Arc::clone(&cloud_admin_api_client),
-        Arc::clone(&s3_client),
-        s3_root.clone(),
-        depth,
-    );
-
-    let s3_deleter = S3Deleter::new(
-        dry_run,
-        NonZeroUsize::new(15).unwrap(),
-        Arc::clone(&s3_client),
-        delete_batch_producer.subscribe(),
-        s3_root.clone(),
-    );
-
-    let (deleter_task_result, batch_producer_task_result) =
-        tokio::join!(s3_deleter.remove_all(), delete_batch_producer.join());
-
-    let deletion_stats = deleter_task_result.context("s3 deletion")?;
-    info!(
-        "Deleted {} tenants ({} keys) and {} timelines ({} keys) total. Dry run: {}",
-        deletion_stats.deleted_tenant_keys.len(),
-        deletion_stats.deleted_tenant_keys.values().sum::<usize>(),
-        deletion_stats.deleted_timeline_keys.len(),
-        deletion_stats.deleted_timeline_keys.values().sum::<usize>(),
-        dry_run,
-    );
-    info!(
-        "Total tenant deletion stats: {:?}",
-        deletion_stats
-            .deleted_tenant_keys
-            .into_iter()
-            .map(|(id, key)| (id.to_string(), key))
-            .collect::<HashMap<_, _>>()
-    );
-    info!(
-        "Total timeline deletion stats: {:?}",
-        deletion_stats
-            .deleted_timeline_keys
-            .into_iter()
-            .map(|(id, key)| (id.to_string(), key))
-            .collect::<HashMap<_, _>>()
-    );
-
-    let batch_producer_stats = batch_producer_task_result.context("delete batch producer join")?;
-    info!(
-        "Total bucket tenants listed: {}; for {} active tenants, timelines checked: {}",
-        batch_producer_stats.tenants_checked(),
-        batch_producer_stats.active_tenants(),
-        batch_producer_stats.timelines_checked()
-    );
-
-    if node_kind == NodeKind::Pageserver {
-        info!("node_kind != pageserver, finish without performing validation step");
-        return Ok(());
-    }
-
-    if skip_validation {
-        info!("--skip-validation is set, exiting");
-        return Ok(());
-    }
-
-    info!("validating active tenants and timelines for pageserver S3 data");
-
-    // TODO kb real stats for validation + better stats for every place: add and print `min`, `max`, `mean` values at least
-    let validation_stats = checks::validate_pageserver_active_tenant_and_timelines(
-        s3_client,
-        s3_root,
-        cloud_admin_api_client,
-        batch_producer_stats,
-    )
-    .await
-    .context("active tenant and timeline validation")?;
-    info!("Finished active tenant and timeline validation, correct timelines: {}, timeline validation errors: {}",
-        validation_stats.normal_timelines.len(), validation_stats.timelines_with_errors.len());
-    if !validation_stats.timelines_with_errors.is_empty() {
-        warn!(
-            "Validation errors: {:#?}",
-            validation_stats
-                .timelines_with_errors
-                .into_iter()
-                .map(|(id, errors)| (id.to_string(), format!("{errors:?}")))
-                .collect::<HashMap<_, _>>()
-        );
-    }
-
-    info!("Done");
-    Ok(())
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let bucket_config = BucketConfig::from_env()?;

+    let command_log_name = match &cli.command {
+        Command::ScanMetadata { .. } => "scan",
+        Command::FindGarbage { .. } => "find-garbage",
+        Command::PurgeGarbage { .. } => "purge-garbage",
+    };
+    let _guard = init_logging(&format!(
+        "{}_{}_{}_{}.log",
+        std::env::args().next().unwrap(),
+        command_log_name,
+        bucket_config.bucket,
+        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
+    ));
+
    match cli.command {
-        Command::Tidy {
-            node_kind,
-            depth,
-            skip_validation,
-        } => {
-            let console_config = ConsoleConfig::from_env()?;
-            tidy(
-                &cli,
-                bucket_config,
-                console_config,
-                node_kind,
-                depth,
-                skip_validation,
-            )
-            .await
-        }
        Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
            Err(e) => {
                tracing::error!("Failed: {e}");
@@ -247,5 +68,16 @@ async fn main() -> anyhow::Result<()> {
                }
            }
        },
+        Command::FindGarbage {
+            node_kind,
+            depth,
+            output_path,
+        } => {
+            let console_config = ConsoleConfig::from_env()?;
+            find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
+        }
+        Command::PurgeGarbage { input_path, mode } => {
+            purge_garbage(input_path, mode, !cli.delete).await
+        }
    }
 }
--- a/s3_scrubber/src/metadata_stream.rs
+++ b/s3_scrubber/src/metadata_stream.rs
@@ -1,9 +1,9 @@
 use anyhow::Context;
 use async_stream::{stream, try_stream};
-use aws_sdk_s3::Client;
+use aws_sdk_s3::{types::ObjectIdentifier, Client};
 use tokio_stream::Stream;

-use crate::{list_objects_with_retries, RootTarget, TenantId};
+use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId};
 use utils::id::{TenantTimelineId, TimelineId};

 /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
@@ -104,3 +104,34 @@ pub async fn stream_tenant_timelines<'a>(
        }
    })
 }
+
+/// Streams the listing of `target`, one page at a time: object keys when its delimiter is
+/// empty, otherwise common prefixes. Every entry is wrapped in an `ObjectIdentifier`.
+pub(crate) fn stream_listing<'a>(
+    s3_client: &'a Client,
+    target: &'a S3Target,
+) -> impl Stream<Item = anyhow::Result<ObjectIdentifier>> + 'a {
+    try_stream! {
+        let mut next_token = None;
+        loop {
+            let page = list_objects_with_retries(s3_client, target, next_token.clone()).await?;
+            // Gather this page's names up front so one loop can emit either kind of entry.
+            let names: Vec<&str> = if target.delimiter.is_empty() {
+                let objects = page.contents().unwrap_or_default();
+                objects.iter().filter_map(|object| object.key()).collect()
+            } else {
+                let prefixes = page.common_prefixes().unwrap_or_default();
+                prefixes.iter().filter_map(|entry| entry.prefix()).collect()
+            };
+            for name in names {
+                yield ObjectIdentifier::builder().key(name).build();
+            }
+
+            // Stop once the listing reports no further continuation token.
+            next_token = page.next_continuation_token;
+            if next_token.is_none() {
+                break;
+            }
+        }
+    }
+}
--- a/s3_scrubber/src/s3_deletion.rs
+++ b/s3_scrubber/src/s3_deletion.rs
@@ -1,434 +0,0 @@
-use std::collections::BTreeMap;
-use std::num::NonZeroUsize;
-use std::sync::Arc;
-use std::time::Duration;
-
-use anyhow::Context;
-use aws_sdk_s3::types::{Delete, ObjectIdentifier};
-use aws_sdk_s3::Client;
-use tokio::sync::mpsc::error::TryRecvError;
-use tokio::sync::mpsc::UnboundedReceiver;
-use tokio::sync::Mutex;
-use tokio::task::JoinSet;
-use tracing::{debug, error, info, info_span, Instrument};
-
-use crate::delete_batch_producer::DeleteBatch;
-use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId, MAX_RETRIES};
-use utils::id::TenantTimelineId;
-
-pub struct S3Deleter {
-    dry_run: bool,
-    concurrent_tasks_count: NonZeroUsize,
-    delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
-    s3_client: Arc<Client>,
-    s3_target: RootTarget,
-}
-
-impl S3Deleter {
-    pub fn new(
-        dry_run: bool,
-        concurrent_tasks_count: NonZeroUsize,
-        s3_client: Arc<Client>,
-        delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
-        s3_target: RootTarget,
-    ) -> Self {
-        Self {
-            dry_run,
-            concurrent_tasks_count,
-            delete_batch_receiver,
-            s3_client,
-            s3_target,
-        }
-    }
-
-    pub async fn remove_all(self) -> anyhow::Result<DeletionStats> {
-        let mut deletion_tasks = JoinSet::new();
-        for id in 0..self.concurrent_tasks_count.get() {
-            let closure_client = Arc::clone(&self.s3_client);
-            let closure_s3_target = self.s3_target.clone();
-            let closure_batch_receiver = Arc::clone(&self.delete_batch_receiver);
-            let dry_run = self.dry_run;
-            deletion_tasks.spawn(
-                async move {
-                    info!("Task started");
-                    (
-                        id,
-                        async move {
-                            let mut task_stats = DeletionStats::default();
-                            loop {
-                                let mut guard = closure_batch_receiver.lock().await;
-                                let receiver_result = guard.try_recv();
-                                drop(guard);
-                                match receiver_result {
-                                    Ok(batch) => {
-                                        let stats = delete_batch(
-                                            &closure_client,
-                                            &closure_s3_target,
-                                            batch,
-                                            dry_run,
-                                        )
-                                        .await
-                                        .context("batch deletion")?;
-                                        debug!(
-                                            "Batch processed, number of objects deleted per tenant in the batch is: {}, per timeline — {}",
-                                            stats.deleted_tenant_keys.len(),
-                                            stats.deleted_timeline_keys.len(),
-                                        );
-                                        task_stats.merge(stats);
-                                    }
-                                    Err(TryRecvError::Empty) => {
-                                        debug!("No tasks yet, waiting");
-                                        tokio::time::sleep(Duration::from_secs(1)).await;
-                                        continue;
-                                    }
-                                    Err(TryRecvError::Disconnected) => {
-                                        info!("Task finished: sender dropped");
-                                        return Ok(task_stats);
-                                    }
-                                }
-                            }
-                        }
-                        .in_current_span()
-                        .await,
-                    )
-                }
-                .instrument(info_span!("deletion_task", %id)),
-            );
-        }
-
-        let mut total_stats = DeletionStats::default();
-        while let Some(task_result) = deletion_tasks.join_next().await {
-            match task_result {
-                Ok((id, Ok(task_stats))) => {
-                    info!("Task {id} completed");
-                    total_stats.merge(task_stats);
-                }
-                Ok((id, Err(e))) => {
-                    error!("Task {id} failed: {e:#}");
-                    return Err(e);
-                }
-                Err(join_error) => anyhow::bail!("Failed to join on a task: {join_error:?}"),
-            }
-        }
-
-        Ok(total_stats)
-    }
-}
-
-/// S3 delete_objects allows up to 1000 keys to be passed in a single request.
-/// Yet if you pass too many key requests, apparently S3 could return with OK and
-/// actually delete nothing, so keep the number lower.
-const MAX_ITEMS_TO_DELETE: usize = 200;
-
-#[derive(Debug, Default)]
-pub struct DeletionStats {
-    pub deleted_tenant_keys: BTreeMap<TenantId, usize>,
-    pub deleted_timeline_keys: BTreeMap<TenantTimelineId, usize>,
-}
-
-impl DeletionStats {
-    fn merge(&mut self, other: Self) {
-        self.deleted_tenant_keys.extend(other.deleted_tenant_keys);
-        self.deleted_timeline_keys
-            .extend(other.deleted_timeline_keys);
-    }
-}
-
-async fn delete_batch(
-    s3_client: &Client,
-    s3_target: &RootTarget,
-    batch: DeleteBatch,
-    dry_run: bool,
-) -> anyhow::Result<DeletionStats> {
-    let (deleted_tenant_keys, deleted_timeline_keys) = tokio::join!(
-        delete_tenants_batch(batch.tenants, s3_target, s3_client, dry_run),
-        delete_timelines_batch(batch.timelines, s3_target, s3_client, dry_run),
-    );
-
-    Ok(DeletionStats {
-        deleted_tenant_keys: deleted_tenant_keys.context("tenant batch deletion")?,
-        deleted_timeline_keys: deleted_timeline_keys.context("timeline batch deletion")?,
-    })
-}
-
-async fn delete_tenants_batch(
-    batched_tenants: Vec<TenantId>,
-    s3_target: &RootTarget,
-    s3_client: &Client,
-    dry_run: bool,
-) -> Result<BTreeMap<TenantId, usize>, anyhow::Error> {
-    info!("Deleting tenants batch of size {}", batched_tenants.len());
-    info!("Tenant ids to remove: {batched_tenants:?}");
-    let deleted_keys = delete_elements(
-        &batched_tenants,
-        s3_target,
-        s3_client,
-        dry_run,
-        |root_target, tenant_to_delete| root_target.tenant_root(&tenant_to_delete),
-    )
-    .await?;
-
-    if !dry_run {
-        let mut last_err = None;
-        for _ in 0..MAX_RETRIES {
-            match ensure_tenant_batch_deleted(s3_client, s3_target, &batched_tenants).await {
-                Ok(()) => {
-                    last_err = None;
-                    break;
-                }
-                Err(e) => {
-                    error!("Failed to ensure the tenant batch is deleted: {e}");
-                    last_err = Some(e);
-                }
-            }
-        }
-
-        if let Some(e) = last_err {
-            anyhow::bail!(
-                "Failed to ensure that tenant batch is deleted {MAX_RETRIES} times: {e:?}"
-            );
-        }
-    }
-
-    Ok(deleted_keys)
-}
-
-async fn delete_timelines_batch(
-    batched_timelines: Vec<TenantTimelineId>,
-    s3_target: &RootTarget,
-    s3_client: &Client,
-    dry_run: bool,
-) -> Result<BTreeMap<TenantTimelineId, usize>, anyhow::Error> {
-    info!(
-        "Deleting timelines batch of size {}",
-        batched_timelines.len()
-    );
-    info!(
-        "Timeline ids to remove: {:?}",
-        batched_timelines
-            .iter()
-            .map(|id| id.to_string())
-            .collect::<Vec<_>>()
-    );
-    let deleted_keys = delete_elements(
-        &batched_timelines,
-        s3_target,
-        s3_client,
-        dry_run,
-        |root_target, timeline_to_delete| root_target.timeline_root(&timeline_to_delete),
-    )
-    .await?;
-
-    if !dry_run {
-        let mut last_err = None;
-        for _ in 0..MAX_RETRIES {
-            match ensure_timeline_batch_deleted(s3_client, s3_target, &batched_timelines).await {
-                Ok(()) => {
-                    last_err = None;
-                    break;
-                }
-                Err(e) => {
-                    error!("Failed to ensure the timelines batch is deleted: {e}");
-                    last_err = Some(e);
-                }
-            }
-        }
-
-        if let Some(e) = last_err {
-            anyhow::bail!(
-                "Failed to ensure that timeline batch is deleted {MAX_RETRIES} times: {e:?}"
-            );
-        }
-    }
-    Ok(deleted_keys)
-}
-
-async fn delete_elements<I>(
-    batched_ids: &Vec<I>,
-    s3_target: &RootTarget,
-    s3_client: &Client,
-    dry_run: bool,
-    target_producer: impl Fn(&RootTarget, I) -> S3Target,
-) -> Result<BTreeMap<I, usize>, anyhow::Error>
-where
-    I: Ord + PartialOrd + Copy,
-{
-    let mut deleted_keys = BTreeMap::new();
-    let mut object_ids_to_delete = Vec::with_capacity(MAX_ITEMS_TO_DELETE);
-    for &id_to_delete in batched_ids {
-        let mut continuation_token = None;
-        let mut subtargets = vec![target_producer(s3_target, id_to_delete)];
-        while let Some(current_target) = subtargets.pop() {
-            loop {
-                let fetch_response = list_objects_with_retries(
-                    s3_client,
-                    &current_target,
-                    continuation_token.clone(),
-                )
-                .await?;
-
-                for object_id in fetch_response
-                    .contents()
-                    .unwrap_or_default()
-                    .iter()
-                    .filter_map(|object| object.key())
-                    .map(|key| ObjectIdentifier::builder().key(key).build())
-                {
-                    if object_ids_to_delete.len() >= MAX_ITEMS_TO_DELETE {
-                        let object_ids_for_request = std::mem::replace(
-                            &mut object_ids_to_delete,
-                            Vec::with_capacity(MAX_ITEMS_TO_DELETE),
-                        );
-                        send_delete_request(
-                            s3_client,
-                            s3_target.bucket_name(),
-                            object_ids_for_request,
-                            dry_run,
-                        )
-                        .await
-                        .context("object ids deletion")?;
-                    }
-
-                    object_ids_to_delete.push(object_id);
-                    *deleted_keys.entry(id_to_delete).or_default() += 1;
-                }
-
-                subtargets.extend(
-                    fetch_response
-                        .common_prefixes()
-                        .unwrap_or_default()
-                        .iter()
-                        .filter_map(|common_prefix| common_prefix.prefix())
-                        .map(|prefix| {
-                            let mut new_target = current_target.clone();
-                            new_target.prefix_in_bucket = prefix.to_string();
-                            new_target
-                        }),
-                );
-
-                match fetch_response.next_continuation_token {
-                    Some(new_token) => continuation_token = Some(new_token),
-                    None => break,
-                }
-            }
-        }
-    }
-    if !object_ids_to_delete.is_empty() {
-        info!("Removing last objects of the batch");
-        send_delete_request(
-            s3_client,
-            s3_target.bucket_name(),
-            object_ids_to_delete,
-            dry_run,
-        )
-        .await
-        .context("Last object ids deletion")?;
-    }
-    Ok(deleted_keys)
-}
-
-pub async fn send_delete_request(
-    s3_client: &Client,
-    bucket_name: &str,
-    ids: Vec<ObjectIdentifier>,
-    dry_run: bool,
-) -> anyhow::Result<()> {
-    info!("Removing {} object ids from S3", ids.len());
-    info!("Object ids to remove: {ids:?}");
-    let delete_request = s3_client
-        .delete_objects()
-        .bucket(bucket_name)
-        .delete(Delete::builder().set_objects(Some(ids)).build());
-    if dry_run {
-        info!("Dry run, skipping the actual removal");
-        Ok(())
-    } else {
-        let original_request = delete_request.clone();
-
-        for _ in 0..MAX_RETRIES {
-            match delete_request
-                .clone()
-                .send()
-                .await
-                .context("delete request processing")
-            {
-                Ok(delete_response) => {
-                    info!("Delete response: {delete_response:?}");
-                    match delete_response.errors() {
-                        Some(delete_errors) => {
-                            error!("Delete request returned errors: {delete_errors:?}");
-                            tokio::time::sleep(Duration::from_secs(1)).await;
-                        }
-                        None => {
-                            info!("Successfully removed an object batch from S3");
-                            return Ok(());
-                        }
-                    }
-                }
-                Err(e) => {
-                    error!("Failed to send a delete request: {e:#}");
-                    tokio::time::sleep(Duration::from_secs(1)).await;
-                }
-            }
-        }
-
-        error!("Failed to do deletion, request: {original_request:?}");
-        anyhow::bail!("Failed to run deletion request {MAX_RETRIES} times");
-    }
-}
-
-async fn ensure_tenant_batch_deleted(
-    s3_client: &Client,
-    s3_target: &RootTarget,
-    batch: &[TenantId],
-) -> anyhow::Result<()> {
-    let mut not_deleted_tenants = Vec::with_capacity(batch.len());
-
-    for &tenant_id in batch {
-        let fetch_response =
-            list_objects_with_retries(s3_client, &s3_target.tenant_root(&tenant_id), None).await?;
-
-        if fetch_response.is_truncated()
-            || fetch_response.contents().is_some()
-            || fetch_response.common_prefixes().is_some()
-        {
-            error!(
-                "Tenant {tenant_id} should be deleted, but its list response is {fetch_response:?}"
-            );
-            not_deleted_tenants.push(tenant_id);
-        }
-    }
-
-    anyhow::ensure!(
-        not_deleted_tenants.is_empty(),
-        "Failed to delete all tenants in a batch. Tenants {not_deleted_tenants:?} should be deleted."
-    );
-    Ok(())
-}
-
-async fn ensure_timeline_batch_deleted(
-    s3_client: &Client,
-    s3_target: &RootTarget,
-    batch: &[TenantTimelineId],
-) -> anyhow::Result<()> {
-    let mut not_deleted_timelines = Vec::with_capacity(batch.len());
-
-    for &id in batch {
-        let fetch_response =
-            list_objects_with_retries(s3_client, &s3_target.timeline_root(&id), None).await?;
-
-        if fetch_response.is_truncated()
-            || fetch_response.contents().is_some()
-            || fetch_response.common_prefixes().is_some()
-        {
-            error!("Timeline {id} should be deleted, but its list response is {fetch_response:?}");
-            not_deleted_timelines.push(id);
-        }
-    }
-
-    anyhow::ensure!(
-        not_deleted_timelines.is_empty(),
-        "Failed to delete all timelines in a batch"
-    );
-    Ok(())
-}
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -1,17 +1,15 @@
 use std::collections::{HashMap, HashSet};
-use std::sync::Arc;

 use crate::checks::{
    branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
    TimelineAnalysis,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_logging, init_s3_client, BucketConfig, RootTarget, S3Target, CLI_NAME};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget};
 use aws_sdk_s3::Client;
-use aws_types::region::Region;
 use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use histogram::Histogram;
-use pageserver::tenant::{IndexPart, TENANTS_SEGMENT_NAME};
+use pageserver::tenant::IndexPart;
 use utils::id::TenantTimelineId;

 pub struct MetadataSummary {
@@ -175,25 +173,7 @@ Timeline layer count: {6}

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
 pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
-    let file_name = format!(
-        "{}_scan_metadata_{}_{}.log",
-        CLI_NAME,
-        bucket_config.bucket,
-        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
-    );
-
-    let _guard = init_logging(&file_name);
-
-    let s3_client = Arc::new(init_s3_client(
-        bucket_config.sso_account_id,
-        Region::new(bucket_config.region),
-    ));
-    let delimiter = "/";
-    let target = RootTarget::Pageserver(S3Target {
-        bucket_name: bucket_config.bucket.to_string(),
-        prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(delimiter),
-        delimiter: delimiter.to_string(),
-    });
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;

    let tenants = stream_tenants(&s3_client, &target);

--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -50,3 +50,7 @@ storage_broker.workspace = true
 utils.workspace = true

 workspace_hack.workspace = true
+
+[[bench]]
+name = "sk_capacity"
+harness = false
--- a/safekeeper/benches/sk_capacity.rs
+++ b/safekeeper/benches/sk_capacity.rs
@@ -0,0 +1,64 @@
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use clap::Parser;
+use safekeeper_api::models::TimelineCreateRequest;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
+const ABOUT: &str = r#"
+Creates many random timelines on the safekeeper.
+
+For example,
+cargo build -r -p safekeeper && target/release/safekeeper
+cargo bench --bench sk_capacity -- -n 1000 --http-addr=http://127.0.0.1:7676
+"#;
+
+#[derive(Parser, Debug)]
+#[clap(author, version, about = ABOUT)]
+struct Args {
+    /// Number of timelines to create
+    #[clap(short = 'n', long, value_parser, default_value_t = 1)]
+    num_timelines: u64,
+    /// HTTP safekeeper address
+    #[clap(long)]
+    http_addr: String,
+    // Fake value to satisfy `cargo bench` passing it.
+    #[clap(long)]
+    bench: bool,
+}
+
+/// Sends one timeline creation request, using a newly generated tenant/timeline id pair,
+/// to the safekeeper HTTP address given in `args`, then prints the response status.
+async fn create_timeline(args: &Args) -> Result<(), Box<dyn std::error::Error>> {
+    // Every call uses a newly generated tenant/timeline id pair.
+    let ids = TenantTimelineId::generate();
+    let body = TimelineCreateRequest {
+        tenant_id: ids.tenant_id,
+        timeline_id: ids.timeline_id,
+        peer_ids: None,
+        pg_version: 160000,
+        system_id: None,
+        wal_seg_size: None,
+        commit_lsn: Lsn(21623024),
+        local_start_lsn: None,
+    };
+
+    // POST the request as JSON to `<http_addr>/v1/tenant/timeline`.
+    let endpoint = format!("{}/v1/tenant/timeline", args.http_addr);
+    let response = reqwest::Client::new().post(endpoint).json(&body).send().await?;
+
+    println!("Response: {:?}", response.status());
+    Ok(())
+}
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let args = Args::parse();
+    // Timelines are created one after another; the first failed request aborts the run.
+    for _ in 0..args.num_timelines {
+        create_timeline(&args).await?;
+    }
+    Ok(())
+}
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -42,7 +42,7 @@ use utils::auth::{JwtAuth, Scope};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
-    project_git_version,
+    project_build_tag, project_git_version,
    sentry_init::init_sentry,
    tcp_listener,
 };
@@ -51,6 +51,7 @@ const PID_FILE_NAME: &str = "safekeeper.pid";
 const ID_FILE_NAME: &str = "safekeeper.id";

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 const ABOUT: &str = r#"
 A fleet of safekeepers is responsible for reliably storing WAL received from
@@ -204,6 +205,7 @@ async fn main() -> anyhow::Result<()> {
    )?;
    logging::replace_panic_hook_with_tracing_panic_hook().forget();
    info!("version: {GIT_VERSION}");
+    info!("buld_tag: {BUILD_TAG}");

    let args_workdir = &args.datadir;
    let workdir = args_workdir.canonicalize_utf8().with_context(|| {
@@ -423,7 +425,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(wal_remover_handle));

-    set_build_info_metric(GIT_VERSION);
+    set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // TODO: update tokio-stream, convert to real async Stream with
    // SignalStream, map it to obtain missing signal name, combine streams into
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -140,6 +140,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                }
            }

+            let ttid = TenantTimelineId::new(
+                self.tenant_id.unwrap_or(TenantId::from([0u8; 16])),
+                self.timeline_id.unwrap_or(TimelineId::from([0u8; 16])),
+            );
+            tracing::Span::current().record("ttid", tracing::field::display(ttid));
+
            Ok(())
        } else {
            Err(QueryError::Other(anyhow::anyhow!(
@@ -208,26 +214,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
            PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc();
        }

-        info!(
-            "got query {:?} in timeline {:?}",
-            query_string, self.timeline_id
-        );
+        info!("got query {:?}", query_string);

        let tenant_id = self.tenant_id.context("tenantid is required")?;
        let timeline_id = self.timeline_id.context("timelineid is required")?;
        self.check_permission(Some(tenant_id))?;
        self.ttid = TenantTimelineId::new(tenant_id, timeline_id);
-        let span_ttid = self.ttid; // satisfy borrow checker

        match cmd {
            SafekeeperPostgresCommand::StartWalPush => {
                self.handle_start_wal_push(pgb)
-                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
+                    .instrument(info_span!("WAL receiver"))
                    .await
            }
            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
                self.handle_start_replication(pgb, start_lsn, term)
-                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
+                    .instrument(info_span!("WAL sender"))
                    .await
            }
            SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -16,20 +16,16 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                continue;
            }
            let ttid = tli.ttid;
-            if let Err(e) = tli
-                .maybe_persist_control_file()
-                .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
-                .await
-            {
-                warn!("failed to persist control file: {e}");
-            }
-            if let Err(e) = tli
-                .remove_old_wal(conf.wal_backup_enabled)
-                .instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
-                .await
-            {
-                error!("failed to remove WAL: {}", e);
+            async {
+                if let Err(e) = tli.maybe_persist_control_file().await {
+                    warn!("failed to persist control file: {e}");
+                }
+                if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
+                    error!("failed to remove WAL: {}", e);
+                }
            }
+            .instrument(info_span!("WAL removal", ttid = %ttid))
+            .await;
        }
        sleep(wal_removal_interval).await;
    }
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -296,8 +296,8 @@ impl GlobalTimelines {
        global_lock
            .timelines
            .values()
-            .cloned()
            .filter(|t| !t.is_cancelled())
+            .cloned()
            .collect()
    }

--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -136,7 +136,7 @@ async fn update_task(

    if elected_me != (entry.handle.is_some()) {
        if elected_me {
-            info!("elected for backup {}: {}", ttid, election_dbg_str);
+            info!("elected for backup: {}", election_dbg_str);

            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
            let timeline_dir = conf.timeline_dir(&ttid);
@@ -149,7 +149,7 @@ async fn update_task(
                    conf.backup_parallel_jobs,
                    shutdown_rx,
                )
-                .instrument(info_span!("WAL backup task", ttid = %ttid)),
+                .in_current_span(),
            );

            entry.handle = Some(WalBackupTaskHandle {
@@ -157,7 +157,7 @@ async fn update_task(
                handle,
            });
        } else {
-            info!("stepping down from backup {}: {}", ttid, election_dbg_str);
+            info!("stepping down from backup: {}", election_dbg_str);
            shut_down_task(ttid, entry).await;
        }
    }
@@ -199,29 +199,33 @@ pub async fn wal_backup_launcher_task_main(
                if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
                    continue; /* just drain the channel and do nothing */
                }
-                let timeline = is_wal_backup_required(ttid).await;
-                // do we need to do anything at all?
-                if timeline.is_some() != tasks.contains_key(&ttid) {
-                    if let Some(timeline) = timeline {
-                        // need to start the task
-                        let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
-                            timeline,
-                            handle: None,
-                        });
-                        update_task(&conf, ttid, entry).await;
-                    } else {
-                        // need to stop the task
-                        info!("stopping WAL backup task for {}", ttid);
-                        let mut entry = tasks.remove(&ttid).unwrap();
-                        shut_down_task(ttid, &mut entry).await;
+                async {
+                    let timeline = is_wal_backup_required(ttid).await;
+                    // do we need to do anything at all?
+                    if timeline.is_some() != tasks.contains_key(&ttid) {
+                        if let Some(timeline) = timeline {
+                            // need to start the task
+                            let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
+                                timeline,
+                                handle: None,
+                            });
+                            update_task(&conf, ttid, entry).await;
+                        } else {
+                            // need to stop the task
+                            info!("stopping WAL backup task");
+                            let mut entry = tasks.remove(&ttid).unwrap();
+                            shut_down_task(ttid, &mut entry).await;
+                        }
                    }
-                }
+                }.instrument(info_span!("WAL backup", ttid = %ttid)).await;
            }
            // For each timeline needing offloading, check if this safekeeper
            // should do the job and start/stop the task accordingly.
            _ = ticker.tick() => {
                for (ttid, entry) in tasks.iter_mut() {
-                    update_task(&conf, *ttid, entry).await;
+                    update_task(&conf, *ttid, entry)
+                        .instrument(info_span!("WAL backup", ttid = %ttid))
+                        .await;
                }
            }
        }
@@ -248,7 +252,7 @@ async fn backup_task_main(
    info!("started");
    let res = GlobalTimelines::get(ttid);
    if let Err(e) = res {
-        error!("backup error for timeline {}: {}", ttid, e);
+        error!("backup error: {}", e);
        return;
    }
    let tli = res.unwrap();
@@ -346,7 +350,7 @@ impl WalBackupTask {
    }
 }

-pub async fn backup_lsn_range(
+async fn backup_lsn_range(
    timeline: &Arc<Timeline>,
    backup_lsn: &mut Lsn,
    end_lsn: Lsn,
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -36,14 +36,14 @@ pub async fn task_main(
        let conf = conf.clone();
        let conn_id = issue_connection_id(&mut connection_count);

-        tokio::spawn(async move {
-            if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope)
-                .instrument(info_span!("", cid = %conn_id))
-                .await
-            {
-                error!("connection handler exited: {}", err);
+        tokio::spawn(
+            async move {
+                if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
+                    error!("connection handler exited: {}", err);
+                }
            }
-        });
+            .instrument(info_span!("", cid = %conn_id, ttid = field::Empty)),
+        );
    }
 }

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -188,7 +188,7 @@ impl PhysicalStorage {
    }

    /// Call fdatasync if config requires so.
-    async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
+    async fn fdatasync_file(&mut self, file: &File) -> Result<()> {
        if !self.conf.no_sync {
            self.metrics
                .observe_flush_seconds(time_io_closure(file.sync_data()).await?);
@@ -197,7 +197,7 @@ impl PhysicalStorage {
    }

    /// Call fsync if config requires so.
-    async fn fsync_file(&mut self, file: &mut File) -> Result<()> {
+    async fn fsync_file(&mut self, file: &File) -> Result<()> {
        if !self.conf.no_sync {
            self.metrics
                .observe_flush_seconds(time_io_closure(file.sync_all()).await?);
@@ -231,7 +231,7 @@ impl PhysicalStorage {
                .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;

            write_zeroes(&mut file, self.wal_seg_size).await?;
-            self.fsync_file(&mut file).await?;
+            self.fsync_file(&file).await?;
            Ok((file, true))
        }
    }
@@ -255,7 +255,7 @@ impl PhysicalStorage {

        if xlogoff + buf.len() == self.wal_seg_size {
            // If we reached the end of a WAL segment, flush and close it.
-            self.fdatasync_file(&mut file).await?;
+            self.fdatasync_file(&file).await?;

            // Rename partial file to completed file
            let (wal_file_path, wal_file_partial_path) =
@@ -277,8 +277,8 @@ impl PhysicalStorage {
    async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
        if self.write_lsn != pos {
            // need to flush the file before discarding it
-            if let Some(mut file) = self.file.take() {
-                self.fdatasync_file(&mut file).await?;
+            if let Some(file) = self.file.take() {
+                self.fdatasync_file(&file).await?;
            }

            self.write_lsn = pos;
@@ -367,8 +367,8 @@ impl Storage for PhysicalStorage {
            return Ok(());
        }

-        if let Some(mut unflushed_file) = self.file.take() {
-            self.fdatasync_file(&mut unflushed_file).await?;
+        if let Some(unflushed_file) = self.file.take() {
+            self.fdatasync_file(&unflushed_file).await?;
            self.file = Some(unflushed_file);
        } else {
            // We have unflushed data (write_lsn != flush_lsn), but no file.
@@ -410,8 +410,8 @@ impl Storage for PhysicalStorage {
        }

        // Close previously opened file, if any
-        if let Some(mut unflushed_file) = self.file.take() {
-            self.fdatasync_file(&mut unflushed_file).await?;
+        if let Some(unflushed_file) = self.file.take() {
+            self.fdatasync_file(&unflushed_file).await?;
        }

        let xlogoff = end_pos.segment_offset(self.wal_seg_size);
@@ -425,7 +425,7 @@ impl Storage for PhysicalStorage {
        // Fill end with zeroes
        file.seek(SeekFrom::Start(xlogoff as u64)).await?;
        write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
-        self.fdatasync_file(&mut file).await?;
+        self.fdatasync_file(&file).await?;

        if !is_partial {
            // Make segment partial once again
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -44,10 +44,11 @@ use storage_broker::{
 };
 use utils::id::TenantTimelineId;
 use utils::logging::{self, LogFormat};
-use utils::project_git_version;
 use utils::sentry_init::init_sentry;
+use utils::{project_build_tag, project_git_version};

 project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);

 const DEFAULT_CHAN_SIZE: usize = 32;
 const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384;
@@ -438,7 +439,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // initialize sentry if SENTRY_DSN is provided
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
    info!("version: {GIT_VERSION}");
-    ::metrics::set_build_info_metric(GIT_VERSION);
+    info!("build_tag: {BUILD_TAG}");
+    metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);

    // On any shutdown signal, log receival and exit.
    std::thread::spawn(move || {
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -60,6 +60,7 @@ from fixtures.utils import (
    allure_attach_from_dir,
    get_self_dir,
    subprocess_capture,
+    wait_until,
 )

 """
@@ -1444,6 +1445,20 @@ class NeonCli(AbstractNeonCli):
        res.check_returncode()
        return res

+    def endpoint_reconfigure(
+        self,
+        endpoint_id: str,
+        tenant_id: Optional[TenantId] = None,
+        pageserver_id: Optional[int] = None,
+        check_return_code=True,
+    ) -> "subprocess.CompletedProcess[str]":
+        args = ["endpoint", "reconfigure", endpoint_id]
+        if tenant_id is not None:
+            args.extend(["--tenant-id", str(tenant_id)])
+        if pageserver_id is not None:
+            args.extend(["--pageserver-id", str(pageserver_id)])
+        return self.raw_cli(args, check_return_code=check_return_code)
+
    def endpoint_stop(
        self,
        endpoint_id: str,
@@ -1539,8 +1554,8 @@ class NeonAttachmentService:

    def attach_hook(self, tenant_id: TenantId, pageserver_id: int) -> int:
        response = requests.post(
-            f"{self.env.control_plane_api}/attach_hook",
-            json={"tenant_id": str(tenant_id), "pageserver_id": pageserver_id},
+            f"{self.env.control_plane_api}/attach-hook",
+            json={"tenant_id": str(tenant_id), "node_id": pageserver_id},
        )
        response.raise_for_status()
        gen = response.json()["gen"]
@@ -1616,22 +1631,21 @@ class NeonPageserver(PgProtocol):
            ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
            ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
            ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
-            ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
+            ".*compaction_loop.*Compaction failed.*, retrying in.*timeline or pageserver is shutting down",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
            ".*task iteration took longer than the configured period.*",
            # this is until #3501
-            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            ".*Compaction failed.*, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
            # these can happen anytime we do compactions from background task and shutdown pageserver
            r".*ERROR.*ancestor timeline \S+ is being stopped",
            # this is expected given our collaborative shutdown approach for the UploadQueue
-            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
+            ".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
            # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
            ".*Error processing HTTP request: NotFound: Timeline .* was not found",
            ".*took more than expected to complete.*",
            # these can happen during shutdown, but it should not be a reason to fail a test
            ".*completed, took longer than expected.*",
-            '.*registered custom resource manager \\\\"neon\\\\".*',
            # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
            # and it is not a failure of our code when it happens.
            ".*DeleteObjects.*We encountered an internal error. Please try again.*",
@@ -1680,6 +1694,40 @@ class NeonPageserver(PgProtocol):
            self.running = False
        return self

+    def restart(self, immediate: bool = False):
+        """
+        High level wrapper for restart: restarts the process, and waits for
+        tenant state to stabilize.
+        """
+        self.stop(immediate=immediate)
+        self.start()
+        self.quiesce_tenants()
+
+    def quiesce_tenants(self):
+        """
+        Wait for all tenants to enter a stable state (Active or Broken)
+
+        Call this after restarting the pageserver, or after attaching a tenant,
+        to ensure that it is ready for use.
+        """
+
+        stable_states = {"Active", "Broken"}
+
+        client = self.http_client()
+
+        def complete():
+            log.info("Checking tenants...")
+            tenants = client.tenant_list()
+            log.info(f"Tenant list: {tenants}...")
+            any_unstable = any((t["state"]["slug"] not in stable_states) for t in tenants)
+            if any_unstable:
+                for t in tenants:
+                    log.info(f"Waiting for tenant {t['id']} in state {t['state']['slug']}")
+            log.info(f"any_unstable={any_unstable}")
+            assert not any_unstable
+
+        wait_until(20, 0.5, complete)
+
    def __enter__(self) -> "NeonPageserver":
        return self

@@ -2500,6 +2548,10 @@ class Endpoint(PgProtocol):

        return self

+    def reconfigure(self, pageserver_id: Optional[int] = None):
+        assert self.endpoint_id is not None
+        self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
+
    def respec(self, **kwargs):
        """Update the endpoint.json file used by control_plane."""
        # Read config
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in the wild by tests with the below contradicting logging
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
            # this seems like a mock_s3 issue
-            log.warn(
+            log.warning(
                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
            )
            keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in one case with mock_s3:
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
            # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-            log.warn(
+            log.warning(
                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
            )

--- a/Show More
+++ b/Show More