Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-07 20:50:38 +00:00)

Compare commits: 51 commits, jcsp/gener...arpad/virt
| Author | SHA1 | Date |
|---|---|---|
| | c056db20c7 | |
| | f417db52e0 | |
| | 2050136437 | |
| | eb2dd7118e | |
| | 0890120517 | |
| | 9e23a91c0b | |
| | f64a2d723a | |
| | bd04abbcab | |
| | ae1af9d10e | |
| | 41e87f92c3 | |
| | 3a8b630f90 | |
| | 5e00c44169 | |
| | d5f1858f78 | |
| | 61d661a6c3 | |
| | da60f69909 | |
| | 743933176e | |
| | 8e25d3e79e | |
| | 4fec48f2b5 | |
| | 88b1ac48bd | |
| | 15ff4e5fd1 | |
| | dbfb4ea7b8 | |
| | c222320a2a | |
| | 89c64e179e | |
| | 7ceddadb37 | |
| | 4904613aaa | |
| | 77658a155b | |
| | 128a85ba5e | |
| | 6cd497bb44 | |
| | 80f10d5ced | |
| | 7e817789d5 | |
| | 41aa627ec0 | |
| | 44da9c38e0 | |
| | cfc0fb573d | |
| | aa22000e67 | |
| | 5edae96a83 | |
| | 40ce520c07 | |
| | e9f2c64322 | |
| | 715077ab5b | |
| | 616e7046c7 | |
| | 1b916a105a | |
| | d11621d904 | |
| | 43bb8bfdbb | |
| | 300a5aa05e | |
| | b9c111962f | |
| | 83ae2bd82c | |
| | f2c21447ce | |
| | 93dcdb293a | |
| | a93274b389 | |
| | a7c0e4dcd0 | |
| | 3b81e0c86d | |
| | e5a397cf96 | |
@@ -14,6 +14,7 @@
!pgxn/
!proxy/
!safekeeper/
!s3_scrubber/
!storage_broker/
!trace/
!vendor/postgres-v14/
37 .github/workflows/approved-for-ci-run.yml (vendored)
@@ -2,7 +2,9 @@ name: Handle `approved-for-ci-run` label
|
||||
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
pull_request_target:
|
||||
branches:
|
||||
- main
|
||||
types:
|
||||
# Default types that triggers a workflow ([1]):
|
||||
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||
@@ -18,29 +20,34 @@ env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
permissions: write-all
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
|
||||
|
||||
jobs:
|
||||
remove-label:
|
||||
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||
# The PR should be reviewed and labelled manually again.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
create-branch:
|
||||
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
create-or-update-pr-for-ci-run:
|
||||
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
if: |
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
@@ -53,3 +60,19 @@ jobs:
|
||||
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- name: Create a Pull Request for CI run (if required)
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
HEAD="ci-run/pr-${PR_NUMBER}"
|
||||
BODY="This Pull Request was create automatically to run CI pipeline for #${PR_NUMBER}.\n\nPlease do not alter or merge/close it.\n\nFeel free to comment the original PR."
|
||||
|
||||
ALREADY_CREATED=$(gh pr --repo "${GITHUB_REPOSITORY}" list --head "${HEAD}" --base "main" --json "number" --jq '.[].number')
|
||||
if [ -z "${ALREADY_CREATED}" ]; then
|
||||
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
|
||||
--body "${BODY}" \
|
||||
--head "${HEAD}" \
|
||||
--base "main" \
|
||||
--draft
|
||||
fi
|
||||
|
||||
66 .github/workflows/benchmarking.yml (vendored)
@@ -117,6 +117,7 @@ jobs:
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
|
||||
tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}
|
||||
|
||||
steps:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
@@ -158,6 +159,25 @@ jobs:
|
||||
|
||||
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Generate matrix for TPC-H benchmarks
|
||||
id: tpch-compare-matrix
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neon-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
pgbench-compare:
|
||||
needs: [ generate-matrices ]
|
||||
|
||||
@@ -233,7 +253,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -358,7 +382,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -372,6 +400,7 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -398,7 +427,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
|
||||
matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
@@ -407,6 +436,7 @@ jobs:
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
@@ -428,18 +458,17 @@ jobs:
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
@@ -447,9 +476,21 @@ jobs:
|
||||
;;
|
||||
esac
|
||||
|
||||
CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${SCALE}_CONNSTR"
|
||||
echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -463,6 +504,7 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -534,7 +576,11 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
QUERY="SELECT version();"
|
||||
if [ "${PLATFORM}" = "neon"* ]; then
|
||||
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
|
||||
fi
|
||||
psql ${CONNSTR} -c "${QUERY}"
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
|
||||
119 .github/workflows/build_and_test.yml (vendored)
@@ -5,7 +5,6 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -422,7 +421,7 @@ jobs:
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests, benchmarks ]
|
||||
needs: [ regress-tests, coverage-report, benchmarks ]
|
||||
if: ${{ !cancelled() }}
|
||||
|
||||
steps:
|
||||
@@ -449,12 +448,18 @@ jobs:
|
||||
reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
|
||||
}
|
||||
|
||||
const coverage = {
|
||||
coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
|
||||
summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
|
||||
}
|
||||
|
||||
const script = require("./scripts/comment-test-report.js")
|
||||
await script({
|
||||
github,
|
||||
context,
|
||||
fetch,
|
||||
report,
|
||||
coverage,
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
@@ -467,24 +472,15 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug ]
|
||||
outputs:
|
||||
coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
# !~/.cargo/registry/src
|
||||
# ~/.cargo/git/
|
||||
# target/
|
||||
# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -527,13 +523,45 @@ jobs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build coverage report NEW
|
||||
id: upload-coverage-report-new
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
run: |
|
||||
BASELINE="$(git merge-base HEAD origin/main)"
|
||||
CURRENT="${COMMIT_SHA}"
|
||||
|
||||
cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
|
||||
|
||||
GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
|
||||
|
||||
# Use differential coverage if the baseline coverage exists.
|
||||
# It can be missing if the coverage repoer wasn't uploaded yet or tests has failed on BASELINE commit.
|
||||
if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
|
||||
git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
|
||||
|
||||
GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
|
||||
fi
|
||||
|
||||
genhtml ${GENHTML_ARGS}
|
||||
|
||||
aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
|
||||
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL, COMMIT_SHA } = process.env
|
||||
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
@@ -544,6 +572,15 @@ jobs:
|
||||
context: 'Code coverage report',
|
||||
})
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${COMMIT_SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL_NEW}`,
|
||||
context: 'Code coverage report NEW',
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
@@ -899,17 +936,13 @@ jobs:
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
build-private-extensions:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
trigger-custom-extensions-build-and-wait:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ tag ]
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
|
||||
|
||||
curl -f -X POST \
|
||||
@@ -939,10 +972,48 @@ jobs:
|
||||
}
|
||||
}"
|
||||
|
||||
- name: Wait for extension build to finish
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
|
||||
INTERVAL=15 # try each N seconds
|
||||
|
||||
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
|
||||
|
||||
for ((i=0; i <= $TIMEOUT; i+=$INTERVAL)); do
|
||||
sleep $INTERVAL
|
||||
|
||||
# Get statuses for the latest commit in the PR / branch
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
"/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
|
||||
|
||||
# Get the latest status for the "build-and-upload-extensions" context
|
||||
last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
|
||||
if [ "${last_status}" = "pending" ]; then
|
||||
# Extension build is still in progress.
|
||||
continue
|
||||
elif [ "${last_status}" = "success" ]; then
|
||||
# Extension build is successful.
|
||||
exit 0
|
||||
else
|
||||
# Status is neither "pending" nor "success", exit the loop and fail the job.
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# Extension build failed, print `statuses.json` for debugging and fail the job.
|
||||
jq '.' statuses.json
|
||||
|
||||
echo >&2 "Status of extension build is '${last_status}' != 'success'"
|
||||
exit 1
|
||||
|
||||
deploy:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
needs: [ promote-images, tag, regress-tests ]
|
||||
needs: [ promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
|
||||
1 .github/workflows/neon_extra_builds.yml (vendored)
@@ -4,7 +4,6 @@ on:
  push:
    branches:
      - main
      - ci-run/pr-*
  pull_request:

defaults:
2 .github/workflows/release.yml (vendored)
@@ -2,7 +2,7 @@ name: Create Release Branch

on:
  schedule:
    - cron: '0 10 * * 2'
    - cron: '0 7 * * 2'
  workflow_dispatch:

jobs:
503 Cargo.lock (generated): file diff suppressed because it is too large
27 Cargo.toml
@@ -7,6 +7,7 @@ members = [
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"s3_scrubber",
|
||||
"workspace_hack",
|
||||
"trace",
|
||||
"libs/compute_api",
|
||||
@@ -37,11 +38,11 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
aws-config = { version = "0.55", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.27"
|
||||
aws-smithy-http = "0.55"
|
||||
aws-credential-types = "0.55"
|
||||
aws-types = "0.55"
|
||||
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.29"
|
||||
aws-smithy-http = "0.56"
|
||||
aws-credential-types = "0.56"
|
||||
aws-types = "0.56"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
@@ -105,12 +106,12 @@ reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustls = "0.20"
|
||||
rustls = "0.21"
|
||||
rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
@@ -125,11 +126,11 @@ sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
tokio-rustls = "0.23"
|
||||
tokio-postgres-rustls = "0.10.0"
|
||||
tokio-rustls = "0.24"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
@@ -143,7 +144,7 @@ tracing-subscriber = { version = "0.3", default_features = false, features = ["s
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
webpki-roots = "0.23"
|
||||
webpki-roots = "0.25"
|
||||
x509-parser = "0.15"
|
||||
|
||||
## TODO replace this with tracing
|
||||
@@ -182,8 +183,8 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
criterion = "0.5.1"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.17"
|
||||
rcgen = "0.11"
|
||||
rstest = "0.18"
|
||||
tempfile = "3.4"
|
||||
tonic-build = "0.9"
|
||||
|
||||
|
||||
@@ -1,12 +1,39 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, Ok, Result};
|
||||
use postgres::Client;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{error, instrument};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
/// Create a special service table for availability checks
|
||||
/// only if it does not exist already.
|
||||
pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
|
||||
let query = "
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_tables
|
||||
WHERE tablename = 'health_check'
|
||||
)
|
||||
THEN
|
||||
CREATE TABLE health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();
|
||||
END IF;
|
||||
END
|
||||
$$;";
|
||||
client.execute(query, &[])?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update timestamp in a row in a special service table to check
|
||||
/// that we can actually write some data in this particular timeline.
|
||||
/// Create table if it's missing.
|
||||
#[instrument(skip_all)]
|
||||
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||
// Connect to the database.
|
||||
@@ -24,19 +51,15 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||
});
|
||||
|
||||
let query = "
|
||||
CREATE TABLE IF NOT EXISTS health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();";
|
||||
|
||||
let result = client.simple_query(query).await?;
|
||||
|
||||
if result.len() != 2 {
|
||||
if result.len() != 1 {
|
||||
return Err(anyhow::format_err!(
|
||||
"expected 2 query results, but got {}",
|
||||
"expected 1 query result, but got {}",
|
||||
result.len()
|
||||
));
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
@@ -696,6 +697,7 @@ impl ComputeNode {
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, self.connstr.as_str())?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -1078,7 +1080,8 @@ LIMIT 100",
|
||||
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
|
||||
let (ext_name, ext_path) =
|
||||
remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
|
||||
download_tasks.push(self.download_extension(ext_name, ext_path));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
|
||||
@@ -180,7 +180,19 @@ pub async fn download_extension(
|
||||
// Create extension control files from spec
|
||||
pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
for ext_data in remote_extensions.extension_data.values() {
|
||||
for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
|
||||
// Check if extension is present in public or custom.
|
||||
// If not, then it is not allowed to be used by this compute.
|
||||
if let Some(public_extensions) = &remote_extensions.public_extensions {
|
||||
if !public_extensions.contains(ext_name) {
|
||||
if let Some(custom_extensions) = &remote_extensions.custom_extensions {
|
||||
if !custom_extensions.contains(ext_name) {
|
||||
continue; // skip this extension, it is not allowed
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (control_name, control_content) in &ext_data.control_data {
|
||||
let control_path = local_sharedir.join(control_name);
|
||||
if !control_path.exists() {
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
@@ -169,7 +171,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
};
|
||||
|
||||
remote_extensions.get_ext(&filename, is_library)
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
is_library,
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
)
|
||||
};
|
||||
|
||||
match ext {
|
||||
@@ -293,7 +300,9 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], port));
|
||||
// this usually binds to both IPv4 and IPv6 on linux
|
||||
// see e.g. https://github.com/rust-lang/rust/pull/34440
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
|
||||
@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
|
||||
// https://www.postgresql.org/docs/15/auth-password.html
|
||||
//
|
||||
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
|
||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
|
||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
|
||||
|
||||
@@ -12,6 +12,7 @@ git-version.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
use anyhow::anyhow;
|
||||
use pageserver_api::control_api::HexTenantId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
@@ -13,9 +13,11 @@ pub struct AttachmentService {
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_id: HexTenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
pub pageserver_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
@@ -89,13 +91,13 @@ impl AttachmentService {
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_id: HexTenantId::new(tenant_id),
|
||||
tenant_id,
|
||||
pageserver_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = client.post(url).json(&request).send()?;
|
||||
if response.status() != StatusCode::OK {
|
||||
return Err(anyhow!("Unexpected status {0}", response.status()));
|
||||
return Err(anyhow!("Unexpected status {}", response.status()));
|
||||
}
|
||||
|
||||
let response = response.json::<AttachHookResponse>()?;
|
||||
|
||||
@@ -6,9 +6,9 @@
|
||||
///
|
||||
use anyhow::anyhow;
|
||||
use clap::Parser;
|
||||
use hex::FromHex;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response};
|
||||
use pageserver_api::control_api::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
@@ -25,15 +25,22 @@ use utils::{
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
use pageserver_api::control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
|
||||
ValidateResponseTenant,
|
||||
};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
/// Host and port to listen on, like `127.0.0.1:1234`
|
||||
#[arg(short, long)]
|
||||
listen: String,
|
||||
listen: std::net::SocketAddr,
|
||||
|
||||
/// Path to the .json file to store state (will be created if it doesn't exist)
|
||||
#[arg(short, long)]
|
||||
path: PathBuf,
|
||||
}
|
||||
@@ -54,13 +61,10 @@ where
|
||||
S: serde::Serializer,
|
||||
V: Clone + Serialize,
|
||||
{
|
||||
eprintln!("to_hex_map");
|
||||
let transformed = input
|
||||
.iter()
|
||||
.map(|(k, v)| (HexTenantId::new(k.clone()), v.clone()));
|
||||
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
|
||||
|
||||
transformed
|
||||
.collect::<HashMap<HexTenantId, V>>()
|
||||
.collect::<HashMap<String, V>>()
|
||||
.serialize(serializer)
|
||||
}
|
||||
|
||||
@@ -69,10 +73,15 @@ where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
V: Deserialize<'de>,
|
||||
{
|
||||
eprintln!("from_hex_map");
|
||||
let hex_map = HashMap::<HexTenantId, V>::deserialize(deserializer)?;
|
||||
|
||||
Ok(hex_map.into_iter().map(|(k, v)| (k.take(), v)).collect())
|
||||
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
|
||||
hex_map
|
||||
.into_iter()
|
||||
.map(|(k, v)| {
|
||||
TenantId::from_hex(k)
|
||||
.map(|k| (k, v))
|
||||
.map_err(serde::de::Error::custom)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Top level state available to all HTTP handlers
|
||||
@@ -102,17 +111,24 @@ impl PersistentState {
|
||||
|
||||
async fn load_or_new(path: &Path) -> Self {
|
||||
match Self::load(path).await {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
tracing::info!(
|
||||
"Creating new state file at {0} (load returned {e})",
|
||||
path.to_string_lossy()
|
||||
);
|
||||
Ok(s) => {
|
||||
tracing::info!("Loaded state file at {}", path.display());
|
||||
s
|
||||
}
|
||||
Err(e)
|
||||
if e.downcast_ref::<std::io::Error>()
|
||||
.map(|e| e.kind() == std::io::ErrorKind::NotFound)
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
tracing::info!("Will create state file at {}", path.display());
|
||||
Self {
|
||||
tenants: HashMap::new(),
|
||||
path: path.to_owned(),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -153,16 +169,13 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
if state.pageserver == Some(reattach_req.node_id) {
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: HexTenantId::new(t.clone()),
|
||||
id: *t,
|
||||
generation: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
locked
|
||||
.save()
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e))?;
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
@@ -172,15 +185,14 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let validate_req = json_request::<ValidateRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req).inner.clone();
|
||||
let locked = state.read().await;
|
||||
let locked = get_state(&req).inner.read().await;
|
||||
|
||||
let mut response = ValidateResponse {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for req_tenant in validate_req.tenants {
|
||||
if let Some(tenant_state) = locked.tenants.get(req_tenant.id.as_ref()) {
|
||||
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
|
||||
let valid = tenant_state.generation == req_tenant.gen;
|
||||
response.tenants.push(ValidateResponseTenant {
|
||||
id: req_tenant.id,
|
||||
@@ -202,7 +214,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
|
||||
let tenant_state = locked
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id.take())
|
||||
.entry(attach_req.tenant_id)
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.pageserver_id,
|
||||
generation: 0,
|
||||
@@ -213,10 +225,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
}
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked
|
||||
.save()
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e))?;
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
@@ -229,9 +238,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", |r| handle_re_attach(r))
|
||||
.post("/validate", |r| handle_validate(r))
|
||||
.post("/attach_hook", |r| handle_attach_hook(r))
|
||||
.post("/re-attach", handle_re_attach)
|
||||
.post("/validate", handle_validate)
|
||||
.post("/attach_hook", handle_attach_hook)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -250,14 +259,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let persistent_state = PersistentState::load_or_new(&args.path).await;
|
||||
|
||||
let http_listener = tcp_listener::bind(&args.listen)?;
|
||||
let http_listener = tcp_listener::bind(args.listen)?;
|
||||
let router = make_router(persistent_state)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen.as_str());
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
server.await?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -354,7 +354,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
.unwrap_or_default();
|
||||
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or(TenantId::generate());
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let generation = if env.pageserver.control_plane_api.is_some() {
|
||||
// We must register the tenant with the attachment service, so
|
||||
|
||||
@@ -138,7 +138,13 @@ impl ComputeControlPlane {
|
||||
mode,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
// We don't setup roles and databases in the spec locally, so we don't need to
|
||||
// do catalog updates. Catalog updates also include check availability
|
||||
// data creation. Yet, we have tests that check that size and db dump
|
||||
// before and after start are the same. So, skip catalog updates,
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates: true,
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -152,7 +158,7 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
skip_pg_catalog_updates: true,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
|
||||
281 docs/rfcs/027-crash-consistent-layer-map-through-index-part.md (new file)
@@ -0,0 +1,281 @@
|
||||
|
||||
# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
|
||||
|
||||
* Created on: Aug 23, 2023
|
||||
* Author: Christian Schwarz
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
|
||||
Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
|
||||
|
||||
## Motivation
|
||||
|
||||
### Background
|
||||
|
||||
We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
|
||||
If we crash or restart pageserver, we reconstruct the layer map from:
|
||||
1. local timeline directory contents
|
||||
2. remote `index_part.json` contents.
|
||||
|
||||
The function that is responsible for this is called `Timeline::load_layer_map()`.
|
||||
The reconciliation process's behavior is the following:
|
||||
* local-only files will become part of the layer map as local-only layers and rescheduled for upload
|
||||
* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
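
The rules above can be sketched as a small decision function. This is only an illustration of the described behavior, not the real `Timeline::load_layer_map()` code: the names (`LayerDecision`, `reconcile`) are made up for this sketch, only the file-size comparison is modeled, and the remote-only case is included for completeness as an assumption.

```rust
/// Hypothetical sketch of the current per-layer reconciliation rule.
#[derive(Debug, PartialEq)]
enum LayerDecision {
    /// Local-only file: keep it in the layer map and reschedule its upload.
    KeepLocalAndScheduleUpload,
    /// Present locally and remotely with matching size: reuse the local file.
    KeepLocal,
    /// Present in both, but the sizes differ: delete the local file, keep a `RemoteLayer`.
    DeleteLocalKeepRemote,
    /// Remote-only file: represent it as a `RemoteLayer` (downloaded on demand later).
    RemoteOnly,
}

fn reconcile(local_size: Option<u64>, remote_size: Option<u64>) -> Option<LayerDecision> {
    match (local_size, remote_size) {
        (Some(_), None) => Some(LayerDecision::KeepLocalAndScheduleUpload),
        (Some(l), Some(r)) if l == r => Some(LayerDecision::KeepLocal),
        (Some(_), Some(_)) => Some(LayerDecision::DeleteLocalKeepRemote),
        (None, Some(_)) => Some(LayerDecision::RemoteOnly),
        (None, None) => None,
    }
}

fn main() {
    assert_eq!(
        reconcile(Some(100), None),
        Some(LayerDecision::KeepLocalAndScheduleUpload)
    );
    assert_eq!(
        reconcile(Some(100), Some(200)),
        Some(LayerDecision::DeleteLocalKeepRemote)
    );
}
```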
|
||||
|
||||
### The Problem
|
||||
|
||||
There are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
|
||||
The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
|
||||
As stated above, making the update to the layer map in atomic way is trivial.
|
||||
But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
|
||||
Currently, we issue the system calls one by one and hope we don't crash.
|
||||
|
||||
What happens if we crash and restart in the middle of that system call sequence?
|
||||
We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
|
||||
|
||||
We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
|
||||
|
||||
### Problem's Implications For Compaction
|
||||
|
||||
The implications of the above are primarily problematic for compaction.
|
||||
Specifically, the part of it that compacts L0 layers into L1 layers.
|
||||
|
||||
Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
|
||||
Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
|
||||
It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
|
||||
|
||||
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
|
||||
This means the compaction after restart will **overwrite** the previously written L1s.
|
||||
Currently we also schedule an S3 upload of the overwritten L1.
|
||||
|
||||
If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
|
||||
|
||||
*However*:
|
||||
1. the file size of the overwritten L1s may not be identical, and
|
||||
2. the bit pattern of the overwritten L1s may not be identical, and,
|
||||
3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
|
||||
|
||||
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
|
||||
|
||||
For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
|
||||
But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
|
||||
That earlier `index_part.json` contained the file size of the pre-overwrite L1.
|
||||
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
|
||||
Effectively, the data in the L1 has become inaccessible to node B.
|
||||
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
|
||||
|
||||
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
|
||||
|
||||
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
|
||||
|
||||
But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
|
||||
|
||||
Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
|
||||
**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
|
||||
|
||||
## Design
|
||||
|
||||
Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
|
||||
Local layer files will be recognized if they match what's listed in the remote index part, and removed otherwise.
|
||||
|
||||
During **timeline load**, the only thing that matters is the remote index part content.
|
||||
Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
|
||||
The local timeline dir's `metadata` file does not matter.
|
||||
The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
|
||||
Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
|
||||
The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
|
||||
Instead, it treats the remote index part as the authoritative layer map.
|
||||
If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if the file size (and, in the future, checksum) matches what's stated in the index part.
|
||||
If it doesn't match, we remove the file from the local timeline dir.
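
A minimal sketch of that proposed rule, under the assumptions above (hypothetical names, not the actual pageserver API): only the remote index part decides whether a local file survives startup.

```rust
/// Hypothetical sketch: should a local layer file be kept during timeline load?
/// `remote_size` is what the remote index_part.json records for this file name,
/// or `None` if the file is not listed there at all.
fn keep_local_file(local_size: u64, remote_size: Option<u64>) -> bool {
    match remote_size {
        // Listed remotely and metadata matches: reuse the local file as a cache hit.
        Some(r) if r == local_size => true,
        // Listed remotely but the size differs, or not listed at all: remove the local file.
        _ => false,
    }
}

fn main() {
    assert!(keep_local_file(128, Some(128)));
    assert!(!keep_local_file(128, Some(64)));
    assert!(!keep_local_file(128, None)); // local-only files are deleted during startup
}
```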
|
||||
|
||||
After load, **at runtime**, nothing changes compared to what we did before this RFC.
|
||||
The procedure for single- and multi-object changes is reproduced here for reference:
|
||||
* For any new layers that the change adds:
|
||||
* Write them to a temporary location.
|
||||
* While holding layer map lock:
|
||||
* Move them to the final location.
|
||||
* Insert into layer map.
|
||||
* Make the S3 changes.
|
||||
We won't reproduce the remote timeline client method calls here because these are subject to change.
|
||||
Instead, we reproduce the sequence of S3 changes that must result from a given single-/multi-object change:
|
||||
* PUT layer files inserted by the change.
|
||||
* PUT an index part that has insertions and deletions of the change.
|
||||
* DELETE the layer files that are deleted by the change.
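
The ordering of those S3 operations is what makes the scheme crash consistent: the `index_part.json` PUT acts as the commit point. Below is a hedged pseudocode sketch of that sequence; the `Change` struct and the `put_object`/`delete_object` helpers are placeholders for this illustration, not the remote timeline client API.

```rust
/// Illustrative description of one single-/multi-object change.
struct Change {
    inserted_layers: Vec<String>, // layer file names added by the change
    deleted_layers: Vec<String>,  // layer file names removed by the change
    new_index_part: Vec<u8>,      // serialized index_part.json reflecting the change
}

fn apply_to_s3(change: &Change) {
    // 1. PUT the new layer files first; crashing here only leaks objects.
    for layer in &change.inserted_layers {
        put_object(layer, b"...layer bytes...");
    }
    // 2. PUT index_part.json; this single atomic PUT is the commit point of the change.
    put_object("index_part.json", &change.new_index_part);
    // 3. DELETE the replaced layers; this may be deferred arbitrarily (worst case: a leak).
    for layer in &change.deleted_layers {
        delete_object(layer);
    }
}

// Stand-ins for the actual remote storage client.
fn put_object(key: &str, _body: &[u8]) {
    println!("PUT {key}");
}

fn delete_object(key: &str) {
    println!("DELETE {key}");
}

fn main() {
    apply_to_s3(&Change {
        inserted_layers: vec!["L1-000".into(), "L1-001".into()],
        deleted_layers: vec!["L0-000".into()],
        new_index_part: b"{}".to_vec(),
    });
}
```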
|
||||
|
||||
Note that it is safe for the DELETE to be deferred arbitrarily.
|
||||
* If it never happens, we leak the object, but, that's not a correctness concern.
|
||||
* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
|
||||
* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
|
||||
|
||||
## How This Solves The Problem
|
||||
|
||||
If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
|
||||
The S3 change sequence above is obviously crash-consistent.
|
||||
If we crash before the index part PUT, then we leak the inserted layer files to S3.
|
||||
If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
|
||||
Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
|
||||
|
||||
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
|
||||
* atomic layer map update at runtime, currently by using an RwLock in write mode
|
||||
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
|
||||
* local timeline dir state:
|
||||
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
|
||||
* if we crash after index part PUT, local layer files will be used, so no on-demand downloads are needed for them
|
||||
* if we crash before index part PUT, local layer files will be deleted
|
||||
|
||||
## Trade-Offs
|
||||
|
||||
### Fundamental
|
||||
|
||||
If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
|
||||
* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
|
||||
* compaction: we lose the entire compaction iteration work; need to re-do it again
|
||||
* gc: no change to what we have today
|
||||
|
||||
If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
|
||||
The amount of work to be re-done is capped by how far the S3 state lags behind the local changes.
|
||||
Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
|
||||
* on-demand downloads that were needed to do the work: are likely still present, not lost
|
||||
* wal ingest: currently unbounded
|
||||
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
|
||||
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
|
||||
* In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
|
||||
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
|
||||
* I have no intuition how expensive / long-running it is in reality.
|
||||
* gc: `update_gc_info` work (not substantial, AFAIK)
|
||||
|
||||
To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
|
||||
However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
|
||||
We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
|
||||
However, this RFC is not constraining the design space either.
|
||||
|
||||
### Practical
|
||||
|
||||
#### Pageserver Restarts
|
||||
|
||||
Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
|
||||
However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
|
||||
|
||||
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
|
||||
They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
|
||||
We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
|
||||
A longer budget would expose tenants that are done early to a longer downtime.
|
||||
A short budget would risk throwing away more work that'd have to be re-done after restart.
|
||||
|
||||
In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
|
||||
We can mitigate this problem as follows:
|
||||
0. initially, by accepting that we need to do the work again
|
||||
1. short-term, introducing measures to cap the amount of in-flight work:
|
||||
|
||||
- cap upload queue length, use backpressure to slow down compaction
|
||||
- disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
|
||||
- introducing a read-only shutdown state for tenants that are fast to shut down;
|
||||
that state would be equivalent to the state of a tenant in hot standby / readonly mode.
|
||||
|
||||
2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
|
||||
|
||||
#### `disk_consistent_lsn` can go backwards
|
||||
|
||||
`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
|
||||
Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
|
||||
Compute certainly doesn't care about `disk_consistent_lsn`.
|
||||
|
||||
|
||||
## Side-Effects Of This Design
|
||||
|
||||
* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
|
||||
|
||||
## Limitations
|
||||
|
||||
Multi-object changes that span multiple timelines aren't covered by this RFC.
|
||||
That's fine because we currently don't need them, as evidenced by the absence
|
||||
of a Pageserver operation that holds multiple timelines' layer map lock at a time.
|
||||
|
||||
## Impacted components
|
||||
|
||||
Primarily pageservers.
|
||||
|
||||
Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
|
||||
No changes to safekeepers are needed.
|
||||
|
||||
## Alternatives considered
|
||||
|
||||
### Alternative 1: WAL
|
||||
|
||||
We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
|
||||
The WAL would be used to
|
||||
1. make multi-object changes atomic
|
||||
2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
|
||||
|
||||
The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
|
||||
* New on-disk state to get right.
|
||||
* Forward- and backward-compatibility development costs in the future.
|
||||
|
||||
### Alternative 2: Flow Everything Through `index_part.json`
|
||||
|
||||
We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
|
||||
I.e., layer map would always be the last-persisted S3 state.
|
||||
That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
|
||||
And it might make hot standbys / read-only pageservers less of a special case in the future.
|
||||
|
||||
But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
|
||||
|
||||
And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
|
||||
|
||||
Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
|
||||
|
||||
### Alternative 3: Sequence Numbers For Layers
|
||||
|
||||
Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
|
||||
|
||||
```
|
||||
# before
|
||||
tenants/$tenant/timelines/$timeline/$key_and_lsn_range
|
||||
# after
|
||||
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
|
||||
```
|
||||
|
||||
To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
|
||||
|
||||
This alternative does not solve atomic layer map updates.
|
||||
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
|
||||
In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
|
||||
We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
|
||||
|
||||
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
|
||||
|
||||
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
|
||||
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute.
|
||||
The proposed design in this RFC addresses both.
|
||||
|
||||
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
|
||||
That way, we avoid a phase where the crash-during-compaction problem is acute.
|
||||
|
||||
## Related issues
|
||||
|
||||
- https://github.com/neondatabase/neon/issues/4749
|
||||
- https://github.com/neondatabase/neon/issues/4418
|
||||
- https://github.com/neondatabase/neon/pull/4422
|
||||
- https://github.com/neondatabase/neon/issues/5077
|
||||
- https://github.com/neondatabase/neon/issues/4088
|
||||
- (re)resolutions:
|
||||
- https://github.com/neondatabase/neon/pull/4696
|
||||
- https://github.com/neondatabase/neon/pull/4094
|
||||
- https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
|
||||
|
||||
Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
|
||||
|
||||
|
||||
## Implementation Plan
|
||||
|
||||
1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
|
||||
|
||||
- The nasty part here is to fix all the tests that fiddle with the local timeline directory.
|
||||
Possibly they are just irrelevant with this change, but, each case will require inspection.
|
||||
|
||||
2. Implement the design above.
|
||||
|
||||
- Initially, ship without the mitigations for restart and accept we will do some work twice.
|
||||
- Measure the impact and implement one of the mitigations.
|
||||
|
||||
@@ -89,6 +89,8 @@ impl RemoteExtSpec {
|
||||
&self,
|
||||
ext_name: &str,
|
||||
is_library: bool,
|
||||
build_tag: &str,
|
||||
pg_major_version: &str,
|
||||
) -> anyhow::Result<(String, RemotePath)> {
|
||||
let mut real_ext_name = ext_name;
|
||||
if is_library {
|
||||
@@ -104,11 +106,32 @@ impl RemoteExtSpec {
|
||||
.ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
|
||||
}
|
||||
|
||||
// Check if extension is present in public or custom.
|
||||
// If not, then it is not allowed to be used by this compute.
|
||||
if let Some(public_extensions) = &self.public_extensions {
|
||||
if !public_extensions.contains(&real_ext_name.to_string()) {
|
||||
if let Some(custom_extensions) = &self.custom_extensions {
|
||||
if !custom_extensions.contains(&real_ext_name.to_string()) {
|
||||
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
Some(ext_data) => Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
)),
|
||||
Some(_ext_data) => {
|
||||
// Construct the path to the extension archive
|
||||
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
|
||||
//
|
||||
// Keep it in sync with path generation in
|
||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||
let archive_path_str =
|
||||
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
|
||||
Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&archive_path_str)?,
|
||||
))
|
||||
}
|
||||
None => Err(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
|
||||
@@ -12,7 +12,6 @@ const_format.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
hex.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
enum-map.workspace = true
|
||||
|
||||
@@ -1,63 +1,22 @@
|
||||
/// Types in this file are for pageserver's upward-facing API calls to the control plane
|
||||
use hex::FromHex;
|
||||
//! Types in this file are for pageserver's upward-facing API calls to the control plane,
|
||||
//! required for acquiring and validating tenant generation numbers.
|
||||
//!
|
||||
//! See docs/rfcs/025-generation-numbers.md
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
/// TenantId's serialization is an array of u8, which is rather unfriendly
|
||||
/// for outside callers who aren't working with the native Rust TenantId.
|
||||
/// This class wraps it in serialization that is just the hex string
|
||||
/// representation.
|
||||
#[derive(Eq, PartialEq, Clone, Hash)]
|
||||
pub struct HexTenantId(TenantId);
|
||||
|
||||
impl HexTenantId {
|
||||
pub fn new(t: TenantId) -> Self {
|
||||
Self(t)
|
||||
}
|
||||
|
||||
pub fn take(self) -> TenantId {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<TenantId> for HexTenantId {
|
||||
fn as_ref(&self) -> &TenantId {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for HexTenantId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
let hex = self.0.hex_encode();
|
||||
serializer.collect_str(&hex)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for HexTenantId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let string = String::deserialize(deserializer)?;
|
||||
TenantId::from_hex(string)
|
||||
.map(|t| HexTenantId::new(t))
|
||||
.map_err(|e| serde::de::Error::custom(format!("{e}")))
|
||||
}
|
||||
}
|
||||
|
||||
// Top level s
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: HexTenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub generation: u32,
|
||||
}
|
||||
|
||||
@@ -66,9 +25,11 @@ pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
pub id: HexTenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
@@ -82,8 +43,10 @@ pub struct ValidateResponse {
|
||||
pub tenants: Vec<ValidateResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
pub id: HexTenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
|
||||
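For context, this is roughly what the `#[serde_as(as = "DisplayFromStr")]` annotation buys over the old `HexTenantId` wrapper: the field is serialized through its `Display` implementation and parsed back with `FromStr`, so outside callers see a plain string instead of an array of bytes. A minimal, self-contained sketch (a `u64` stands in for `TenantId`, and serde_json is used only for illustration):

use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};

#[serde_as]
#[derive(Serialize, Deserialize)]
struct ExampleTenant {
    // Emitted as a string via Display, parsed back via FromStr.
    #[serde_as(as = "DisplayFromStr")]
    id: u64,
    generation: u32,
}

fn main() {
    let t = ExampleTenant { id: 42, generation: 7 };
    // Prints: {"id":"42","generation":7}
    println!("{}", serde_json::to_string(&t).unwrap());
}
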
@@ -201,6 +201,15 @@ pub struct TenantCreateRequest {
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantCreateRequest {
|
||||
type Target = TenantConfig;
|
||||
|
||||
|
||||
@@ -13,14 +13,13 @@ use std::{
|
||||
collections::HashMap,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf, StripPrefixError},
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io;
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
@@ -45,34 +44,12 @@ pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
// From the S3 spec
|
||||
pub const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
/// Path on the remote storage, relative to some inner prefix.
|
||||
/// The prefix is an implementation detail, that allows representing local paths
|
||||
/// as the remote ones, stripping the local storage prefix away.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl Serialize for RemotePath {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for RemotePath {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let str = String::deserialize(deserializer)?;
|
||||
Ok(Self(PathBuf::from(&str)))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
@@ -111,15 +88,6 @@ impl RemotePath {
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()?.to_str()
|
||||
}
|
||||
|
||||
/// Unwrap the PathBuf that RemotePath wraps
|
||||
pub fn take(self) -> PathBuf {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -198,8 +166,6 @@ pub enum DownloadError {
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// The client was shut down
|
||||
Shutdown,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -211,7 +177,6 @@ impl std::fmt::Display for DownloadError {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Shutdown => write!(f, "Client shutting down"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
@@ -276,18 +241,6 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
|
||||
/// For small, simple downloads where caller doesn't want to handle the streaming: return the full body
|
||||
pub async fn download_all(&self, from: &RemotePath) -> Result<Vec<u8>, DownloadError> {
|
||||
let mut download = self.download(from).await?;
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
tokio::io::copy(&mut download.download_stream, &mut bytes)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download body from {from}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
pub async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -155,18 +155,20 @@ impl RemoteStorage for LocalFs {
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
match fs::metadata(full_path.clone()).await {
|
||||
Err(e) => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
if matches!(e.kind(), ErrorKind::NotFound) {
|
||||
initial_dir.pop();
|
||||
}
|
||||
}
|
||||
Ok(meta) => {
|
||||
if !meta.is_dir() {
|
||||
// It's not a directory: strip back to the parent
|
||||
initial_dir.pop();
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// It's not a file that exists: strip the prefix back to the parent directory
|
||||
initial_dir.pop();
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected I/O error
|
||||
anyhow::bail!(e)
|
||||
}
|
||||
}
|
||||
|
||||
// Note that PathBuf starts_with only considers full path segments, but
|
||||
|
||||
@@ -22,7 +22,7 @@ use aws_sdk_s3::{
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::{Body, StatusCode};
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
@@ -529,16 +529,7 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if let Some(r) = e.raw_response() {
|
||||
if r.http().status() == StatusCode::NOT_FOUND {
|
||||
// 404 is acceptable for deletions. AWS S3 does not return this, but
|
||||
// some other implementations might (e.g. GCS XML API returns 404 on DeleteObject
|
||||
// to a missing key)
|
||||
continue;
|
||||
} else {
|
||||
return Err(anyhow::format_err!("DeleteObjects response error: {e}"));
|
||||
}
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
use std::fmt::Display;
|
||||
use std::fmt::Debug;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
/// Tenant generations are used to provide split-brain safety and allow
|
||||
/// multiple pageservers to attach the same tenant concurrently.
|
||||
///
|
||||
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
|
||||
/// numbers are used.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub enum Generation {
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
@@ -48,6 +53,7 @@ impl Generation {
|
||||
matches!(self, Self::None)
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
pub fn get_suffix(&self) -> String {
|
||||
match self {
|
||||
Self::Valid(v) => {
|
||||
@@ -60,19 +66,27 @@ impl Generation {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn previous(&self) -> Self {
|
||||
if let Self::Valid(v) = self {
|
||||
Self::new(v - 1)
|
||||
} else {
|
||||
Self::none()
|
||||
}
|
||||
/// `suffix` is the part after "-" in a key
|
||||
///
|
||||
/// Returns None if parsing was unsuccessful
|
||||
pub fn parse_suffix(suffix: &str) -> Option<Generation> {
|
||||
u32::from_str_radix(suffix, 16).map(Generation::new).ok()
|
||||
}
|
||||
|
||||
pub fn into(self) -> Option<u32> {
|
||||
if let Self::Valid(v) = self {
|
||||
Some(v)
|
||||
} else {
|
||||
None
|
||||
#[track_caller]
|
||||
pub fn previous(&self) -> Generation {
|
||||
match self {
|
||||
Self::Valid(n) => {
|
||||
if *n == 0 {
|
||||
// Since a tenant may be upgraded from a pre-generations state, interpret the "previous" generation
|
||||
// to 0 as being "no generation".
|
||||
Self::None
|
||||
} else {
|
||||
Self::Valid(n - 1)
|
||||
}
|
||||
}
|
||||
Self::None => Self::None,
|
||||
Self::Broken => panic!("Attempted to use a broken generation"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -89,7 +103,7 @@ impl Serialize for Generation {
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(
|
||||
"Tried to serialize invalid generation",
|
||||
"Tried to serialize invalid generation ({self})",
|
||||
))
|
||||
}
|
||||
}
|
||||
@@ -104,7 +118,10 @@ impl<'de> Deserialize<'de> for Generation {
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for Generation {
|
||||
// We intentionally do not implement Display for Generation, to reduce the
|
||||
// risk of a bug where the generation is used in a format!() string directly
|
||||
// instead of using get_suffix().
|
||||
impl Debug for Generation {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Valid(v) => {
|
||||
|
||||
@@ -24,9 +24,6 @@ pub enum ApiError {
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(Box<str>),
|
||||
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -55,10 +52,6 @@ impl ApiError {
|
||||
self.to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
),
|
||||
ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status(
|
||||
"Shutting down".to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -50,7 +50,7 @@ impl Id {
|
||||
Id::from(tli_buf)
|
||||
}
|
||||
|
||||
pub fn hex_encode(&self) -> String {
|
||||
fn hex_encode(&self) -> String {
|
||||
static HEX: &[u8] = b"0123456789abcdef";
|
||||
|
||||
let mut buf = vec![0u8; self.0.len() * 2];
|
||||
@@ -133,10 +133,6 @@ macro_rules! id_newtype {
|
||||
pub const fn from_array(b: [u8; 16]) -> Self {
|
||||
$t(Id(b))
|
||||
}
|
||||
|
||||
pub fn hex_encode(&self) -> String {
|
||||
self.0.hex_encode()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for $t {
|
||||
@@ -248,13 +244,13 @@ id_newtype!(TenantId);
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
/// See [`Id`] for alternative ways to serialize it.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
pub struct ConnectionId(Id);
|
||||
|
||||
id_newtype!(ConnectionId);
|
||||
|
||||
// A pair uniquely identifying Neon instance.
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct TenantTimelineId {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
@@ -277,36 +273,6 @@ impl TenantTimelineId {
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for TenantTimelineId {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TenantTimelineId {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let str = String::deserialize(deserializer)?;
|
||||
if let Some((tenant_part, timeline_part)) = str.split_once('/') {
|
||||
Ok(Self {
|
||||
tenant_id: TenantId(Id::from_hex(tenant_part).map_err(|e| {
|
||||
serde::de::Error::custom(format!("Malformed tenant in TenantTimelineId: {e}"))
|
||||
})?),
|
||||
timeline_id: TimelineId(Id::from_hex(timeline_part).map_err(|e| {
|
||||
serde::de::Error::custom(format!("Malformed timeline in TenantTimelineId {e}"))
|
||||
})?),
|
||||
})
|
||||
} else {
|
||||
Err(serde::de::Error::custom("Malformed TenantTimelineId"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{}/{}", self.tenant_id, self.timeline_id)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
|
||||
|
||||
use anyhow::Result;
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::ops::Range;
|
||||
@@ -96,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -142,12 +143,12 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let mut total_delta_layers = 0usize;
|
||||
let mut total_image_layers = 0usize;
|
||||
let mut total_excess_layers = 0usize;
|
||||
for tenant in fs::read_dir(storage_path.join("tenants"))? {
|
||||
for tenant in fs::read_dir(storage_path.join(TENANTS_SEGMENT_NAME))? {
|
||||
let tenant = tenant?;
|
||||
if !tenant.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
|
||||
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
|
||||
let timeline = timeline?;
|
||||
if !timeline.file_type()?.is_dir() {
|
||||
continue;
|
||||
|
||||
@@ -5,6 +5,7 @@ use clap::Subcommand;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
repository::{Key, KEY_SIZE},
|
||||
@@ -47,7 +48,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
let path = path.as_ref();
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let file = FileBlockReader::new(VirtualFile::open(path).await?);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
@@ -68,7 +69,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
|
||||
let cursor = BlockCursor::new_fileblockreader(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
@@ -80,13 +81,13 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
|
||||
let tenant = tenant?;
|
||||
if !tenant.file_type()?.is_dir() {
|
||||
continue;
|
||||
}
|
||||
println!("tenant {}", tenant.file_name().to_string_lossy());
|
||||
for timeline in fs::read_dir(tenant.path().join("timelines"))? {
|
||||
for timeline in fs::read_dir(tenant.path().join(TIMELINES_SEGMENT_NAME))? {
|
||||
let timeline = timeline?;
|
||||
if !timeline.file_type()?.is_dir() {
|
||||
continue;
|
||||
@@ -101,9 +102,9 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
timeline,
|
||||
} => {
|
||||
let timeline_path = path
|
||||
.join("tenants")
|
||||
.join(TENANTS_SEGMENT_NAME)
|
||||
.join(tenant)
|
||||
.join("timelines")
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
.join(timeline);
|
||||
let mut idx = 0;
|
||||
for layer in fs::read_dir(timeline_path)? {
|
||||
|
||||
@@ -2,14 +2,12 @@
|
||||
|
||||
use std::env::{var, VarError};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::deletion_queue::{DeletionQueue, DeletionQueueError};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
@@ -351,35 +349,6 @@ fn start_pageserver(
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Set up deletion queue
|
||||
let deletion_queue_cancel = tokio_util::sync::CancellationToken::new();
|
||||
let (deletion_queue, deletion_frontend, deletion_backend, deletion_executor) =
|
||||
DeletionQueue::new(remote_storage.clone(), conf, deletion_queue_cancel.clone());
|
||||
if let Some(mut deletion_frontend) = deletion_frontend {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_frontend
|
||||
.background()
|
||||
.instrument(info_span!(parent:None, "deletion frontend"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
if let Some(mut deletion_backend) = deletion_backend {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_backend
|
||||
.background()
|
||||
.instrument(info_span!(parent: None, "deletion backend"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
if let Some(mut deletion_executor) = deletion_executor {
|
||||
BACKGROUND_RUNTIME.spawn(async move {
|
||||
deletion_executor
|
||||
.background()
|
||||
.instrument(info_span!(parent: None, "deletion executor"))
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
@@ -417,9 +386,9 @@ fn start_pageserver(
|
||||
TenantSharedResources {
|
||||
broker_client: broker_client.clone(),
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
|
||||
BACKGROUND_RUNTIME.spawn({
|
||||
@@ -514,7 +483,6 @@ fn start_pageserver(
|
||||
http_auth,
|
||||
broker_client.clone(),
|
||||
remote_storage,
|
||||
deletion_queue.clone(),
|
||||
disk_usage_eviction_state,
|
||||
)?
|
||||
.build()
|
||||
@@ -637,36 +605,6 @@ fn start_pageserver(
|
||||
// The plan is to change that over time.
|
||||
shutdown_pageserver.take();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
let dq = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
match tokio::time::timeout(Duration::from_secs(5), dq.new_client().flush()).await {
|
||||
Ok(flush_r) => {
|
||||
match flush_r {
|
||||
Ok(()) => {
|
||||
info!("Deletion queue flushed successfully on shutdown")
|
||||
}
|
||||
Err(e) => {
|
||||
match e {
|
||||
DeletionQueueError::ShuttingDown => {
|
||||
// This is not harmful for correctness, but is unexpected: the deletion
|
||||
// queue's workers should stay alive as long as there are any client handles instantiated.
|
||||
warn!("Deletion queue stopped prematurely");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Timed out flushing deletion queue on shutdown ({e})")
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Clean shutdown of deletion queue workers
|
||||
deletion_queue_cancel.cancel();
|
||||
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
@@ -32,7 +32,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
@@ -576,28 +577,7 @@ impl PageServerConf {
|
||||
//
|
||||
|
||||
pub fn tenants_path(&self) -> PathBuf {
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
pub fn deletion_prefix(&self) -> PathBuf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix()
|
||||
.join(format!("{sequence:016x}-{VERSION:02x}.list"))
|
||||
}
|
||||
|
||||
pub fn deletion_header_path(&self) -> PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
const VERSION: u8 = 1;
|
||||
|
||||
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
}
|
||||
|
||||
pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
|
||||
119
pageserver/src/control_plane_client.rs
Normal file
@@ -0,0 +1,119 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::control_api::{ReAttachRequest, ReAttachResponse};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
backoff,
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
// Backoffs when control plane requests do not succeed: compromise between reducing load
|
||||
// on control plane, and retrying frequently when we are blocked on a control plane
|
||||
// response to make progress.
|
||||
const BACKOFF_INCREMENT: f64 = 0.1;
|
||||
const BACKOFF_MAX: f64 = 10.0;
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
pub(crate) struct ControlPlaneClient {
|
||||
http_client: reqwest::Client,
|
||||
base_url: Url,
|
||||
node_id: NodeId,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl ControlPlaneClient {
|
||||
/// A None return value indicates that the input `conf` object does not have control
|
||||
/// plane API enabled.
|
||||
pub(crate) fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
|
||||
let mut url = match conf.control_plane_api.as_ref() {
|
||||
Some(u) => u.clone(),
|
||||
None => return None,
|
||||
};
|
||||
|
||||
if let Ok(mut segs) = url.path_segments_mut() {
|
||||
// This ensures that `url` ends with a slash if it doesn't already.
|
||||
// That way, we can subsequently use join() to safely attach extra path elements.
|
||||
segs.pop_if_empty().push("");
|
||||
}
|
||||
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
Some(Self {
|
||||
http_client: client,
|
||||
base_url: url,
|
||||
node_id: conf.id,
|
||||
cancel: cancel.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
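The trailing-slash handling above matters because of how `url::Url::join` resolves relative paths; a small illustration with hypothetical URLs:

use url::Url;

// Without a trailing slash, join() replaces the last path segment...
let base = Url::parse("http://cplane.local/api/v1").unwrap();
assert_eq!(base.join("re-attach").unwrap().as_str(),
           "http://cplane.local/api/re-attach");

// ...with the slash ensured by pop_if_empty().push(""), it appends instead.
let base = Url::parse("http://cplane.local/api/v1/").unwrap();
assert_eq!(base.join("re-attach").unwrap().as_str(),
           "http://cplane.local/api/v1/re-attach");
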
async fn try_re_attach(
|
||||
&self,
|
||||
url: Url,
|
||||
request: &ReAttachRequest,
|
||||
) -> anyhow::Result<ReAttachResponse> {
|
||||
match self.http_client.post(url).json(request).send().await {
|
||||
Err(e) => Err(anyhow::Error::from(e)),
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
r.json::<ReAttachResponse>()
|
||||
.await
|
||||
.map_err(anyhow::Error::from)
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Block until we get a successful response
|
||||
pub(crate) async fn re_attach(&self) -> anyhow::Result<HashMap<TenantId, Generation>> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
let result = self.try_re_attach(re_attach_path.clone(), &request).await;
|
||||
match result {
|
||||
Ok(res) => {
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
res.tenants.len()
|
||||
);
|
||||
|
||||
return Ok(res
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.collect::<HashMap<_, _>>());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
|
||||
backoff::exponential_backoff(
|
||||
attempt,
|
||||
BACKOFF_INCREMENT,
|
||||
BACKOFF_MAX,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(anyhow::anyhow!("Shutting down"));
|
||||
}
|
||||
attempt += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,850 +0,0 @@
|
||||
mod backend;
|
||||
mod executor;
|
||||
mod frontend;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::metrics::DELETION_QUEUE_SUBMITTED;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_with::serde_as;
|
||||
use thiserror::Error;
|
||||
use tokio;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{self, debug, error};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
pub(crate) use self::backend::BackendQueueWorker;
|
||||
use self::executor::ExecutorWorker;
|
||||
use self::frontend::DeletionOp;
|
||||
pub(crate) use self::frontend::FrontendQueueWorker;
|
||||
use backend::BackendQueueMessage;
|
||||
use executor::ExecutorMessage;
|
||||
use frontend::FrontendQueueMessage;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
|
||||
|
||||
// TODO: administrative "panic button" config property to disable all deletions
|
||||
// TODO: configurable for how long to wait before executing deletions
|
||||
|
||||
/// We aggregate object deletions from many tenants in one place, for several reasons:
|
||||
/// - Coalesce deletions into fewer DeleteObjects calls
|
||||
/// - Enable Tenant/Timeline lifetimes to be shorter than the time it takes
|
||||
/// to flush any outstanding deletions.
|
||||
/// - Globally control throughput of deletions, as these are a low priority task: do
|
||||
/// not compete with the same S3 clients/connections used for higher priority uploads.
|
||||
/// - Future: enable validating that we may do deletions in a multi-attached scenario,
|
||||
/// via generation numbers (see https://github.com/neondatabase/neon/pull/4919)
|
||||
///
|
||||
/// There are two kinds of deletion: deferred and immediate. A deferred deletion
|
||||
/// may be intentionally delayed to protect passive readers of S3 data, and may
|
||||
/// be subject to a generation number validation step. An immediate deletion is
|
||||
/// ready to execute immediately, and is only queued up so that it can be coalesced
|
||||
/// with other deletions in flight.
|
||||
///
|
||||
/// Deferred deletions pass through three steps:
|
||||
/// - Frontend: accumulate deletion requests from Timelines, and batch them up into
|
||||
/// DeletionLists, which are persisted to S3.
|
||||
/// - Backend: accumulate deletion lists, and validate them en-masse prior to passing
|
||||
/// the keys in the list onward for actual deletion
|
||||
/// - Executor: accumulate object keys that the backend has validated for immediate
|
||||
/// deletion, and execute them in batches of 1000 keys via DeleteObjects.
|
||||
///
|
||||
/// Non-deferred deletions, such as during timeline deletion, bypass the first
|
||||
/// two stages and are passed straight into the Executor.
|
||||
///
|
||||
/// Internally, each stage is joined by a channel to the next. In S3, there is only
|
||||
/// one queue (of DeletionLists), which is written by the frontend and consumed
|
||||
/// by the backend.
|
||||
#[derive(Clone)]
|
||||
pub struct DeletionQueue {
|
||||
client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FlushOp {
|
||||
tx: tokio::sync::oneshot::Sender<()>,
|
||||
}
|
||||
|
||||
impl FlushOp {
|
||||
fn fire(self) {
|
||||
if self.tx.send(()).is_err() {
|
||||
// oneshot channel closed. This is legal: a client could be destroyed while waiting for a flush.
|
||||
debug!("deletion queue flush from dropped client");
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DeletionQueueClient {
|
||||
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct TenantDeletionList {
|
||||
/// For each Timeline, a list of key fragments to append to the timeline remote path
|
||||
/// when reconstructing a full key
|
||||
timelines: HashMap<TimelineId, Vec<String>>,
|
||||
|
||||
/// The generation in which this deletion was emitted: note that this may not be the
|
||||
/// same as the generation of any layers being deleted. The generation of the layer
|
||||
/// has already been absorbed into the keys in `objects`
|
||||
generation: Generation,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionList {
|
||||
/// Serialization version, for future use
|
||||
version: u8,
|
||||
|
||||
/// Used for constructing a unique key for each deletion list we write out.
|
||||
sequence: u64,
|
||||
|
||||
/// To avoid repeating tenant/timeline IDs in every key, we store keys in
|
||||
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
|
||||
/// with one unique generation ID: if someone tries to push a second generation
|
||||
/// ID for the same tenant, we will start a new DeletionList.
|
||||
tenants: HashMap<TenantId, TenantDeletionList>,
|
||||
|
||||
/// Avoid having to walk `tenants` to calculate size
|
||||
size: usize,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionHeader {
|
||||
/// Serialization version, for future use
|
||||
version: u8,
|
||||
|
||||
/// Enable determining the next sequence number even if there are no deletion lists present.
|
||||
/// If there _are_ deletion lists present, then their sequence numbers take precedence over
|
||||
/// this.
|
||||
last_deleted_list_seq: u64,
|
||||
// TODO: this is where we will track a 'clean' sequence number that indicates all deletion
|
||||
// lists <= that sequence have had their generations validated with the control plane
|
||||
// and are OK to execute.
|
||||
}
|
||||
|
||||
impl DeletionHeader {
|
||||
const VERSION_LATEST: u8 = 1;
|
||||
|
||||
fn new(last_deleted_list_seq: u64) -> Self {
|
||||
Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
last_deleted_list_seq,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DeletionList {
|
||||
const VERSION_LATEST: u8 = 1;
|
||||
fn new(sequence: u64) -> Self {
|
||||
Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
sequence,
|
||||
tenants: HashMap::new(),
|
||||
size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn drain(&mut self) -> Self {
|
||||
let mut tenants = HashMap::new();
|
||||
std::mem::swap(&mut self.tenants, &mut tenants);
|
||||
let other = Self {
|
||||
version: Self::VERSION_LATEST,
|
||||
sequence: self.sequence,
|
||||
tenants,
|
||||
size: self.size,
|
||||
};
|
||||
self.size = 0;
|
||||
other
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.tenants.is_empty()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.size
|
||||
}
|
||||
|
||||
/// Returns true if the push was accepted, false if the caller must start a new
|
||||
/// deletion list.
|
||||
fn push(
|
||||
&mut self,
|
||||
tenant: &TenantId,
|
||||
timeline: &TimelineId,
|
||||
generation: Generation,
|
||||
objects: &mut Vec<RemotePath>,
|
||||
) -> bool {
|
||||
if objects.is_empty() {
|
||||
// Avoid inserting an empty TimelineDeletionList: this preserves the property
|
||||
// that if we have no keys, then self.objects is empty (used in Self::is_empty)
|
||||
return true;
|
||||
}
|
||||
|
||||
let tenant_entry = self
|
||||
.tenants
|
||||
.entry(*tenant)
|
||||
.or_insert_with(|| TenantDeletionList {
|
||||
timelines: HashMap::new(),
|
||||
generation: generation,
|
||||
});
|
||||
|
||||
if tenant_entry.generation != generation {
|
||||
// Only one generation per tenant per list: signal to
|
||||
// caller to start a new list.
|
||||
return false;
|
||||
}
|
||||
|
||||
let timeline_entry = tenant_entry
|
||||
.timelines
|
||||
.entry(*timeline)
|
||||
.or_insert_with(|| Vec::new());
|
||||
|
||||
let timeline_remote_path = remote_timeline_path(tenant, timeline);
|
||||
|
||||
self.size += objects.len();
|
||||
timeline_entry.extend(objects.drain(..).map(|p| {
|
||||
p.strip_prefix(&timeline_remote_path)
|
||||
.expect("Timeline paths always start with the timeline prefix")
|
||||
.to_string_lossy()
|
||||
.to_string()
|
||||
}));
|
||||
true
|
||||
}
|
||||
|
||||
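A sketch of the caller-side pattern that the boolean return of `push` implies (this is not the actual frontend worker code; `seal_and_persist` is a hypothetical stand-in for writing the full list to remote storage):

async fn push_or_rotate(
    current: &mut DeletionList,
    next_sequence: &mut u64,
    tenant: &TenantId,
    timeline: &TimelineId,
    generation: Generation,
    objects: &mut Vec<RemotePath>,
) {
    if !current.push(tenant, timeline, generation, objects) {
        // Only one generation per tenant per list: seal the current list and
        // start a fresh one with the next sequence number.
        let sealed = std::mem::replace(current, DeletionList::new(*next_sequence));
        *next_sequence += 1;
        seal_and_persist(sealed).await;
        // The retry cannot conflict: the fresh list has no entry for this tenant.
        assert!(current.push(tenant, timeline, generation, objects));
    }
}
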
fn take_paths(self) -> Vec<RemotePath> {
|
||||
let mut result = Vec::new();
|
||||
for (tenant, tenant_deletions) in self.tenants.into_iter() {
|
||||
for (timeline, timeline_layers) in tenant_deletions.timelines.into_iter() {
|
||||
let timeline_remote_path = remote_timeline_path(&tenant, &timeline);
|
||||
result.extend(
|
||||
timeline_layers
|
||||
.into_iter()
|
||||
.map(|l| timeline_remote_path.join(&PathBuf::from(l))),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DeletionQueueError {
|
||||
#[error("Deletion queue unavailable during shutdown")]
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
impl DeletionQueueClient {
|
||||
async fn do_push(&self, msg: FrontendQueueMessage) -> Result<(), DeletionQueueError> {
|
||||
match self.tx.send(msg).await {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => {
|
||||
// This shouldn't happen, we should shut down all tenants before
|
||||
// we shut down the global delete queue. If we encounter a bug like this,
|
||||
// we may leak objects as deletions won't be processed.
|
||||
error!("Deletion queue closed while pushing, shutting down? ({e})");
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Submit a list of layers for deletion: this function will return before the deletion is
|
||||
/// persistent, but it may be executed at any time after this function enters: do not push
|
||||
/// layers until you're sure they can be deleted safely (i.e. remote metadata no longer
|
||||
/// references them).
|
||||
pub(crate) async fn push_layers(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
generation: Generation,
|
||||
layers: Vec<(LayerFileName, Generation)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
DELETION_QUEUE_SUBMITTED.inc_by(layers.len() as u64);
|
||||
self.do_push(FrontendQueueMessage::Delete(DeletionOp {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
layers,
|
||||
generation,
|
||||
objects: Vec::new(),
|
||||
}))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_flush(
|
||||
&self,
|
||||
msg: FrontendQueueMessage,
|
||||
rx: tokio::sync::oneshot::Receiver<()>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
self.do_push(msg).await?;
|
||||
if rx.await.is_err() {
|
||||
// This shouldn't happen if tenants are shut down before deletion queue. If we
|
||||
// encounter a bug like this, then a flusher will incorrectly believe it has flushed
|
||||
// when it hasn't, possibly leading to leaking objects.
|
||||
error!("Deletion queue dropped flush op while client was still waiting");
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait until all previous deletions are persistent (either executed, or written to a DeletionList)
|
||||
pub async fn flush(&self) -> Result<(), DeletionQueueError> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
self.do_flush(FrontendQueueMessage::Flush(FlushOp { tx }), rx)
|
||||
.await
|
||||
}
|
||||
|
||||
// Wait until all previous deletions are executed
|
||||
pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
|
||||
debug!("flush_execute: flushing to deletion lists...");
|
||||
// Flush any buffered work to deletion lists
|
||||
self.flush().await?;
|
||||
|
||||
// Flush execution of deletion lists
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
debug!("flush_execute: flushing execution...");
|
||||
self.do_flush(FrontendQueueMessage::FlushExecute(FlushOp { tx }), rx)
|
||||
.await?;
|
||||
debug!("flush_execute: finished flushing execution...");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This interface bypasses the persistent deletion queue, and any validation
|
||||
/// that this pageserver is still eligible to execute the deletions. It is for
|
||||
/// use in timeline deletions, where the control plane is telling us we may
|
||||
/// delete everything in the timeline.
|
||||
///
|
||||
/// DO NOT USE THIS FROM GC OR COMPACTION CODE. Use the regular `push_layers`.
|
||||
pub(crate) async fn push_immediate(
|
||||
&self,
|
||||
objects: Vec<RemotePath>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
self.executor_tx
|
||||
.send(ExecutorMessage::Delete(objects))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
|
||||
/// Companion to push_immediate. When this returns Ok, all prior objects sent
|
||||
/// into push_immediate have been deleted from remote storage.
|
||||
pub(crate) async fn flush_immediate(&self) -> Result<(), DeletionQueueError> {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
self.executor_tx
|
||||
.send(ExecutorMessage::Flush(FlushOp { tx }))
|
||||
.await
|
||||
.map_err(|_| DeletionQueueError::ShuttingDown)?;
|
||||
|
||||
rx.await.map_err(|_| DeletionQueueError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
|
||||
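To tie the client API together, a minimal usage sketch modelled on the smoke test further below (error handling elided; the identifiers passed in are assumed to already exist, and the three returned workers must be spawned by the caller, e.g. with tokio::spawn):

async fn deletion_queue_example(
    conf: &'static PageServerConf,
    remote_storage: GenericRemoteStorage,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    layer: LayerFileName,
    layer_generation: Generation,
    current_generation: Generation,
) -> Result<(), DeletionQueueError> {
    let cancel = CancellationToken::new();
    let (queue, _fe, _be, _ex) =
        DeletionQueue::new(Some(remote_storage), conf, cancel.clone());
    // _fe/_be/_ex are Some(worker) here because remote storage was provided;
    // their background() futures must be spawned for anything to make progress.

    let client = queue.new_client();
    // Deferred deletion: buffered, written to a DeletionList, then executed.
    client
        .push_layers(tenant_id, timeline_id, current_generation,
                     vec![(layer, layer_generation)])
        .await?;
    client.flush().await?;          // durable: written to a deletion list
    client.flush_execute().await?;  // executed: DeleteObjects has run
    Ok(())
}
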
impl DeletionQueue {
|
||||
pub fn new_client(&self) -> DeletionQueueClient {
|
||||
self.client.clone()
|
||||
}
|
||||
|
||||
/// Caller may use the returned object to construct clients with new_client.
|
||||
/// Caller should tokio::spawn the background() members of the two worker objects returned:
|
||||
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
|
||||
///
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new(
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
conf: &'static PageServerConf,
|
||||
cancel: CancellationToken,
|
||||
) -> (
|
||||
Self,
|
||||
Option<FrontendQueueWorker>,
|
||||
Option<BackendQueueWorker>,
|
||||
Option<ExecutorWorker>,
|
||||
) {
|
||||
// Deep channel: it consumes deletions from all timelines and we do not want to block them
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(16384);
|
||||
|
||||
// Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
|
||||
let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
|
||||
|
||||
// Shallow channel: it carries lists of paths, and we expect the main queueing to
|
||||
// happen in the backend (persistent), not in this queue.
|
||||
let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16);
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
None => {
|
||||
return (
|
||||
Self {
|
||||
client: DeletionQueueClient { tx, executor_tx },
|
||||
},
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
}
|
||||
Some(r) => r,
|
||||
};
|
||||
|
||||
(
|
||||
Self {
|
||||
client: DeletionQueueClient {
|
||||
tx,
|
||||
executor_tx: executor_tx.clone(),
|
||||
},
|
||||
},
|
||||
Some(FrontendQueueWorker::new(
|
||||
conf,
|
||||
rx,
|
||||
backend_tx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
Some(BackendQueueWorker::new(
|
||||
conf,
|
||||
backend_rx,
|
||||
executor_tx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
Some(ExecutorWorker::new(
|
||||
remote_storage,
|
||||
executor_rx,
|
||||
cancel.clone(),
|
||||
)),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use hex_literal::hex;
|
||||
use std::{
|
||||
io::ErrorKind,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use tracing::info;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use tokio::{runtime::EnterGuard, task::JoinHandle};
|
||||
|
||||
use crate::tenant::{harness::TenantHarness, remote_timeline_client::remote_timeline_path};
|
||||
|
||||
use super::*;
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
_entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness,
|
||||
remote_fs_dir: PathBuf,
|
||||
storage: GenericRemoteStorage,
|
||||
deletion_queue: DeletionQueue,
|
||||
fe_worker: JoinHandle<()>,
|
||||
be_worker: JoinHandle<()>,
|
||||
ex_worker: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
/// Simulate a pageserver restart by destroying and recreating the deletion queue
|
||||
fn restart(&mut self) {
|
||||
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
|
||||
Some(self.storage.clone()),
|
||||
self.harness.conf,
|
||||
CancellationToken::new(),
|
||||
);
|
||||
|
||||
self.deletion_queue = deletion_queue;
|
||||
|
||||
let mut fe_worker = fe_worker.unwrap();
|
||||
let mut be_worker = be_worker.unwrap();
|
||||
let mut ex_worker = ex_worker.unwrap();
|
||||
let mut fe_worker = self
|
||||
.runtime
|
||||
.spawn(async move { fe_worker.background().await });
|
||||
let mut be_worker = self
|
||||
.runtime
|
||||
.spawn(async move { be_worker.background().await });
|
||||
let mut ex_worker = self.runtime.spawn(async move {
|
||||
drop(ex_worker.background().await);
|
||||
});
|
||||
std::mem::swap(&mut self.fe_worker, &mut fe_worker);
|
||||
std::mem::swap(&mut self.be_worker, &mut be_worker);
|
||||
std::mem::swap(&mut self.ex_worker, &mut ex_worker);
|
||||
|
||||
// Join the old workers
|
||||
self.runtime.block_on(fe_worker).unwrap();
|
||||
self.runtime.block_on(be_worker).unwrap();
|
||||
self.runtime.block_on(ex_worker).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
|
||||
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
|
||||
// We do not load() the harness: we only need its config and remote_storage
|
||||
|
||||
// Set up a GenericRemoteStorage targeting a directory
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let entered_runtime = runtime.enter();
|
||||
|
||||
let (deletion_queue, fe_worker, be_worker, ex_worker) = DeletionQueue::new(
|
||||
Some(storage.clone()),
|
||||
harness.conf,
|
||||
CancellationToken::new(),
|
||||
);
|
||||
|
||||
let mut fe_worker = fe_worker.unwrap();
|
||||
let mut be_worker = be_worker.unwrap();
|
||||
let mut ex_worker = ex_worker.unwrap();
|
||||
let fe_worker_join = runtime.spawn(async move { fe_worker.background().await });
|
||||
let be_worker_join = runtime.spawn(async move { be_worker.background().await });
|
||||
let ex_worker_join = runtime.spawn(async move {
|
||||
drop(ex_worker.background().await);
|
||||
});
|
||||
|
||||
Ok(TestSetup {
|
||||
runtime,
|
||||
_entered_runtime: entered_runtime,
|
||||
harness,
|
||||
remote_fs_dir,
|
||||
storage,
|
||||
deletion_queue,
|
||||
fe_worker: fe_worker_join,
|
||||
be_worker: be_worker_join,
|
||||
ex_worker: ex_worker_join,
|
||||
})
|
||||
}
|
||||
|
||||
// TODO: put this in a common location so that we can share with remote_timeline_client's tests
|
||||
fn assert_remote_files(expected: &[&str], remote_path: &Path) {
|
||||
let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
|
||||
expected.sort();
|
||||
|
||||
let mut found: Vec<String> = Vec::new();
|
||||
let dir = match std::fs::read_dir(remote_path) {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
if expected.is_empty() {
|
||||
// We are asserting prefix is empty: it is expected that the dir is missing
|
||||
return;
|
||||
} else {
|
||||
assert_eq!(expected, Vec::<String>::new());
|
||||
unreachable!();
|
||||
}
|
||||
} else {
|
||||
panic!(
|
||||
"Unexpected error listing {0}: {e}",
|
||||
remote_path.to_string_lossy()
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
for entry in dir.flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
found.push(String::from(fname));
|
||||
}
|
||||
found.sort();
|
||||
|
||||
assert_eq!(expected, found);
|
||||
}
|
||||
|
||||
fn assert_local_files(expected: &[&str], directory: &Path) {
|
||||
let mut dir = match std::fs::read_dir(directory) {
|
||||
Ok(d) => d,
|
||||
Err(_) => {
|
||||
assert_eq!(expected, &Vec::<String>::new());
|
||||
return;
|
||||
}
|
||||
};
|
||||
let mut found = Vec::new();
|
||||
while let Some(dentry) = dir.next() {
|
||||
let dentry = dentry.unwrap();
|
||||
let file_name = dentry.file_name();
|
||||
let file_name_str = file_name.to_string_lossy();
|
||||
found.push(file_name_str.to_string());
|
||||
}
|
||||
found.sort();
|
||||
assert_eq!(expected, found);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deletion_queue_smoke() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
|
||||
let content: Vec<u8> = "victim1 contents".into();
|
||||
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||
|
||||
// Exercise the distinction between the generation of the layers
|
||||
// we delete, and the generation of the running Tenant.
|
||||
let layer_generation = Generation::new(0xdeadbeef);
|
||||
let now_generation = Generation::new(0xfeedbeef);
|
||||
|
||||
let remote_layer_file_name_1 =
|
||||
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
||||
|
||||
// Inject a victim file to remote storage
|
||||
info!("Writing");
|
||||
std::fs::create_dir_all(&remote_timeline_path)?;
|
||||
std::fs::write(
|
||||
remote_timeline_path.join(remote_layer_file_name_1.clone()),
|
||||
content,
|
||||
)?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
|
||||
// File should still be there after we push it to the queue (we haven't pushed enough to flush anything)
|
||||
info!("Pushing");
|
||||
ctx.runtime.block_on(client.push_layers(
|
||||
tenant_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
|
||||
))?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
|
||||
assert_local_files(&[], &deletion_prefix);
|
||||
|
||||
// File should still be there after we write a deletion list (we haven't pushed enough to execute anything)
|
||||
info!("Flushing");
|
||||
ctx.runtime.block_on(client.flush())?;
|
||||
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
|
||||
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
|
||||
|
||||
// File should go away when we execute
|
||||
info!("Flush-executing");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_remote_files(&[], &remote_timeline_path);
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
|
||||
// Flushing on an empty queue should succeed immediately, and not write any lists
|
||||
info!("Flush-executing on empty");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deletion_queue_recovery() -> anyhow::Result<()> {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
|
||||
let content: Vec<u8> = "victim1 contents".into();
|
||||
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
|
||||
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
|
||||
let deletion_prefix = ctx.harness.conf.deletion_prefix();
|
||||
let layer_generation = Generation::new(0xdeadbeef);
|
||||
let now_generation = Generation::new(0xfeedbeef);
|
||||
let remote_layer_file_name_1 =
|
||||
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
|
||||
|
||||
// Inject a file, delete it, and flush to a deletion list
|
||||
std::fs::create_dir_all(&remote_timeline_path)?;
|
||||
std::fs::write(
|
||||
remote_timeline_path.join(remote_layer_file_name_1.clone()),
|
||||
content,
|
||||
)?;
|
||||
ctx.runtime.block_on(client.push_layers(
|
||||
tenant_id,
|
||||
TIMELINE_ID,
|
||||
now_generation,
|
||||
[(layer_file_name_1.clone(), layer_generation)].to_vec(),
|
||||
))?;
|
||||
ctx.runtime.block_on(client.flush())?;
|
||||
assert_local_files(&["0000000000000001-01.list"], &deletion_prefix);
|
||||
|
||||
// Restart the deletion queue
|
||||
drop(client);
|
||||
ctx.restart();
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
|
||||
// If we have recovered the deletion list properly, then executing after restart should purge it
|
||||
info!("Flush-executing");
|
||||
ctx.runtime.block_on(client.flush_execute())?;
|
||||
assert_remote_files(&[], &remote_timeline_path);
|
||||
assert_local_files(&["header-01"], &deletion_prefix);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A lightweight queue which can issue ordinary DeletionQueueClient objects, but doesn't do any persistence
|
||||
/// or coalescing, and doesn't actually execute any deletions unless you call pump() to kick it.
|
||||
#[cfg(test)]
|
||||
pub mod mock {
|
||||
use tracing::info;
|
||||
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
|
||||
use super::*;
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
pub struct MockDeletionQueue {
|
||||
tx: tokio::sync::mpsc::Sender<FrontendQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
tx_pump: tokio::sync::mpsc::Sender<FlushOp>,
|
||||
executed: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
impl MockDeletionQueue {
|
||||
pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(16384);
|
||||
let (tx_pump, mut rx_pump) = tokio::sync::mpsc::channel::<FlushOp>(1);
|
||||
let (executor_tx, mut executor_rx) = tokio::sync::mpsc::channel(16384);
|
||||
|
||||
let executed = Arc::new(AtomicUsize::new(0));
|
||||
let executed_bg = executed.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let remote_storage = match &remote_storage {
|
||||
Some(rs) => rs,
|
||||
None => {
|
||||
info!("No remote storage configured, deletion queue will not run");
|
||||
return;
|
||||
}
|
||||
};
|
||||
info!("Running mock deletion queue");
|
||||
// Each time we are asked to pump, drain the queue of deletions
|
||||
while let Some(flush_op) = rx_pump.recv().await {
|
||||
info!("Executing all pending deletions");
|
||||
|
||||
// Transform all executor messages to generic frontend messages
|
||||
while let Ok(msg) = executor_rx.try_recv() {
|
||||
match msg {
|
||||
ExecutorMessage::Delete(objects) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to delete {path}, leaking object! ({e})"
|
||||
);
|
||||
}
|
||||
}
|
||||
executed_bg.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
ExecutorMessage::Flush(flush_op) => {
|
||||
flush_op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while let Ok(msg) = rx.try_recv() {
|
||||
match msg {
|
||||
FrontendQueueMessage::Delete(op) => {
|
||||
let mut objects = op.objects;
|
||||
for (layer, generation) in op.layers {
|
||||
objects.push(remote_layer_path(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
&layer,
|
||||
generation,
|
||||
));
|
||||
}
|
||||
|
||||
for path in objects {
|
||||
info!("Executing deletion {path}");
|
||||
match remote_storage.delete(&path).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to delete {path}, leaking object! ({e})"
|
||||
);
|
||||
}
|
||||
}
|
||||
executed_bg.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::Flush(op) => {
|
||||
op.fire();
|
||||
}
|
||||
FrontendQueueMessage::FlushExecute(op) => {
|
||||
// We have already executed all prior deletions because mock does them inline
|
||||
op.fire();
|
||||
}
|
||||
}
|
||||
info!("All pending deletions have been executed");
|
||||
}
|
||||
flush_op
|
||||
.tx
|
||||
.send(())
|
||||
.expect("Test called flush but dropped before finishing");
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
tx,
|
||||
tx_pump,
|
||||
executor_tx,
|
||||
executed,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_executed(&self) -> usize {
|
||||
self.executed.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub async fn pump(&self) {
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
self.tx_pump
|
||||
.send(FlushOp { tx })
|
||||
.await
|
||||
.expect("pump called after deletion queue loop stopped");
|
||||
rx.await
|
||||
.expect("Mock delete queue shutdown while waiting to pump");
|
||||
}
|
||||
|
||||
pub(crate) fn new_client(&self) -> DeletionQueueClient {
|
||||
DeletionQueueClient {
|
||||
tx: self.tx.clone(),
|
||||
executor_tx: self.executor_tx.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
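// Editorial sketch (not part of this diff): how a test might drive the mock queue above.
// `make_test_remote_storage()` is an assumed helper producing a local-fs backed GenericRemoteStorage,
// and the `generate()` constructors on the id types are assumed; everything else follows the APIs
// shown in this file (push_layers, pump, get_executed).
#[tokio::test]
async fn mock_deletion_queue_executes_on_pump() -> anyhow::Result<()> {
    let remote_storage = make_test_remote_storage()?; // assumption: local-fs storage for tests
    let queue = mock::MockDeletionQueue::new(Some(remote_storage));
    let client = queue.new_client();

    let layer: LayerFileName =
        "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51"
            .parse()
            .unwrap();
    client
        .push_layers(
            TenantId::generate(),
            TimelineId::generate(),
            Generation::new(2),
            vec![(layer, Generation::new(1))],
        )
        .await?;

    // The mock performs no deletions until the test explicitly pumps it.
    assert_eq!(queue.get_executed(), 0);
    queue.pump().await;
    assert_eq!(queue.get_executed(), 1);
    Ok(())
}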
|
||||
@@ -1,300 +0,0 @@
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::future::TryFutureExt;
|
||||
use pageserver_api::control_api::HexTenantId;
|
||||
use pageserver_api::control_api::{ValidateRequest, ValidateRequestTenant, ValidateResponse};
|
||||
use serde::de::DeserializeOwned;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
|
||||
use super::executor::ExecutorMessage;
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
|
||||
// After this length of time, execute deletions which are eligible to run,
|
||||
// even if we haven't accumulated enough for a full-sized DeleteObjects
|
||||
const EXECUTE_IDLE_DEADLINE: Duration = Duration::from_secs(60);
|
||||
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
const AUTOFLUSH_KEY_COUNT: usize = 16384;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum BackendQueueMessage {
|
||||
Delete(DeletionList),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
pub struct BackendQueueWorker {
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
|
||||
// Accumulate some lists to execute in a batch.
|
||||
// The purpose of this accumulation is to implement batched validation of
|
||||
// attachment generations, when split-brain protection is implemented.
|
||||
// (see https://github.com/neondatabase/neon/pull/4919)
|
||||
pending_lists: Vec<DeletionList>,
|
||||
|
||||
// Sum of all the lengths of lists in pending_lists
|
||||
pending_key_count: usize,
|
||||
|
||||
// DeletionLists we have fully executed, which may be deleted
|
||||
// from remote storage.
|
||||
executed_lists: Vec<DeletionList>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum ValidateCallError {
|
||||
#[error("shutdown")]
|
||||
Shutdown,
|
||||
#[error("remote: {0}")]
|
||||
Remote(reqwest::Error),
|
||||
}
|
||||
|
||||
async fn retry_http_forever<T>(
|
||||
url: &url::Url,
|
||||
request: ValidateRequest,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<T, DeletionQueueError>
|
||||
where
|
||||
T: DeserializeOwned,
|
||||
{
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
let response = match backoff::retry(
|
||||
|| {
|
||||
client
|
||||
.post(url.clone())
|
||||
.json(&request)
|
||||
.send()
|
||||
                .map_err(ValidateCallError::Remote)
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"calling control plane generation validation API",
|
||||
backoff::Cancel::new(cancel.clone(), || ValidateCallError::Shutdown),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(ValidateCallError::Shutdown) => {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
Err(ValidateCallError::Remote(_)) => {
|
||||
panic!("We retry forever");
|
||||
}
|
||||
Ok(r) => r,
|
||||
};
|
||||
|
||||
// TODO: handle non-200 response
|
||||
// TODO: handle decode error
|
||||
Ok(response.json::<T>().await.unwrap())
|
||||
}
|
||||
|
||||
impl BackendQueueWorker {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<BackendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ExecutorMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
pending_lists: Vec::new(),
|
||||
pending_key_count: 0,
|
||||
executed_lists: Vec::new(),
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup_lists(&mut self) {
|
||||
debug!(
|
||||
"cleanup_lists: {0} executed lists, {1} pending lists",
|
||||
self.executed_lists.len(),
|
||||
self.pending_lists.len()
|
||||
);
|
||||
|
||||
// Lists are always pushed into the queues + executed list in sequence order, so
|
||||
// no sort is required: can find the highest sequence number by peeking at last element
|
||||
let max_executed_seq = match self.executed_lists.last() {
|
||||
Some(v) => v.sequence,
|
||||
None => {
|
||||
// No executed lists, nothing to clean up.
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// In case this is the last list, write a header out first so that
|
||||
// we don't risk losing our knowledge of the sequence number (on replay, our
|
||||
// next sequence number is the highest list seen + 1, or read from the header
|
||||
// if there are no lists)
|
||||
let header = DeletionHeader::new(max_executed_seq);
|
||||
debug!("Writing header {:?}", header);
|
||||
let header_bytes =
|
||||
serde_json::to_vec(&header).expect("Failed to serialize deletion header");
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
|
||||
if let Err(e) = tokio::fs::write(&header_path, header_bytes).await {
|
||||
warn!("Failed to upload deletion queue header: {e:#}");
|
||||
DELETION_QUEUE_ERRORS
|
||||
.with_label_values(&["put_header"])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
while let Some(list) = self.executed_lists.pop() {
|
||||
let list_path = self.conf.deletion_list_path(list.sequence);
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files
|
||||
tracing::error!("Failed to delete {0}: {e:#}", list_path.display());
|
||||
self.executed_lists.push(list);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn validate_lists(&mut self) -> Result<(), DeletionQueueError> {
|
||||
let control_plane_api = match &self.conf.control_plane_api {
|
||||
None => {
|
||||
// Generations are not switched on yet.
|
||||
return Ok(());
|
||||
}
|
||||
Some(api) => api,
|
||||
};
|
||||
|
||||
let validate_path = control_plane_api
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
|
||||
for list in &mut self.pending_lists {
|
||||
let request = ValidateRequest {
|
||||
tenants: list
|
||||
.tenants
|
||||
.iter()
|
||||
.map(|(tid, tdl)| ValidateRequestTenant {
|
||||
id: HexTenantId::new(*tid),
|
||||
gen: tdl.generation.into().expect(
|
||||
"Generation should always be valid for a Tenant doing deletions",
|
||||
),
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
// Retry forever, we cannot make progress until we get a response
|
||||
let response: ValidateResponse =
|
||||
retry_http_forever(&validate_path, request, self.cancel.clone()).await?;
|
||||
|
||||
let tenants_valid: HashMap<_, _> = response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id.take(), t.valid))
|
||||
.collect();
|
||||
|
||||
// Filter the list based on whether the server responded valid: true.
|
||||
// If a tenant is omitted in the response, it has been deleted, and we should
|
||||
// proceed with deletion.
|
||||
list.tenants.retain(|tenant_id, _tenant| {
|
||||
                let r = tenants_valid.get(tenant_id).copied().unwrap_or(true);
|
||||
if !r {
|
||||
warn!("Dropping stale deletions for tenant {tenant_id}, objects may be leaked");
|
||||
}
|
||||
r
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn flush(&mut self) {
|
||||
// Issue any required generation validation calls to the control plane
|
||||
if let Err(DeletionQueueError::ShuttingDown) = self.validate_lists().await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
}
|
||||
|
||||
// Submit all keys from pending DeletionLists into the executor
|
||||
for list in self.pending_lists.drain(..) {
|
||||
let objects = list.take_paths();
|
||||
if let Err(_e) = self.tx.send(ExecutorMessage::Delete(objects)).await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
// Flush the executor to ensure all the operations we just submitted have been executed
|
||||
let (tx, rx) = tokio::sync::oneshot::channel::<()>();
|
||||
let flush_op = FlushOp { tx };
|
||||
if let Err(_e) = self.tx.send(ExecutorMessage::Flush(flush_op)).await {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
};
|
||||
if rx.await.is_err() {
|
||||
warn!("Shutting down");
|
||||
return;
|
||||
}
|
||||
|
||||
// After flush, we are assured that all contents of the pending lists
|
||||
// are executed
|
||||
self.pending_key_count = 0;
|
||||
self.executed_lists.append(&mut self.pending_lists);
|
||||
|
||||
// Erase the lists we executed
|
||||
self.cleanup_lists().await;
|
||||
}
|
||||
|
||||
pub async fn background(&mut self) {
|
||||
// TODO: if we would like to be able to defer deletions while a Layer still has
|
||||
        // refs (but it will be eligible for deletion after process ends), then we may
|
||||
// add an ephemeral part to BackendQueueMessage::Delete that tracks which keys
|
||||
// in the deletion list may not be deleted yet, with guards to block on while
|
||||
// we wait to proceed.
|
||||
|
||||
loop {
|
||||
let msg = match tokio::time::timeout(EXECUTE_IDLE_DEADLINE, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending
|
||||
self.flush().await;
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
BackendQueueMessage::Delete(list) => {
|
||||
self.pending_key_count += list.len();
|
||||
self.pending_lists.push(list);
|
||||
|
||||
if self.pending_key_count > AUTOFLUSH_KEY_COUNT {
|
||||
self.flush().await;
|
||||
}
|
||||
}
|
||||
BackendQueueMessage::Flush(op) => {
|
||||
self.flush().await;
|
||||
op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,143 +0,0 @@
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
use crate::metrics::DELETION_QUEUE_EXECUTED;
|
||||
|
||||
use super::DeletionQueueError;
|
||||
use super::FlushOp;
|
||||
|
||||
const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
pub(super) enum ExecutorMessage {
|
||||
Delete(Vec<RemotePath>),
|
||||
Flush(FlushOp),
|
||||
}
|
||||
|
||||
/// Non-persistent deletion queue, for coalescing multiple object deletes into
|
||||
/// larger DeleteObjects requests.
|
||||
pub struct ExecutorWorker {
|
||||
    // Accumulate up to MAX_KEYS_PER_DELETE (1000, the S3 DeleteObjects limit) keys for the next deletion operation
|
||||
accumulator: Vec<RemotePath>,
|
||||
|
||||
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
impl ExecutorWorker {
|
||||
pub(super) fn new(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
rx: tokio::sync::mpsc::Receiver<ExecutorMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
remote_storage,
|
||||
rx,
|
||||
cancel,
|
||||
accumulator: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap the remote `delete_objects` with a failpoint
|
||||
pub async fn remote_delete(&self) -> Result<(), anyhow::Error> {
|
||||
fail::fail_point!("deletion-queue-before-execute", |_| {
|
||||
info!("Skipping execution, failpoint set");
|
||||
DELETION_QUEUE_ERRORS
|
||||
.with_label_values(&["failpoint"])
|
||||
.inc();
|
||||
Err(anyhow::anyhow!("failpoint hit"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
}
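    // Editorial sketch (not part of this diff): a test can trip the failpoint above through the
    // `fail` crate's standard configuration API, which is what `fail_point!` reads from:
    //
    //     fail::cfg("deletion-queue-before-execute", "return").unwrap();
    //     // ... drive the queue; remote_delete() now errors and bumps DELETION_QUEUE_ERRORS ...
    //     fail::remove("deletion-queue-before-execute");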
|
||||
|
||||
/// Block until everything in accumulator has been executed
|
||||
pub async fn flush(&mut self) -> Result<(), DeletionQueueError> {
|
||||
while !self.accumulator.is_empty() && !self.cancel.is_cancelled() {
|
||||
match self.remote_delete().await {
|
||||
Ok(()) => {
|
||||
// Note: we assume that the remote storage layer returns Ok(()) if some
|
||||
// or all of the deleted objects were already gone.
|
||||
DELETION_QUEUE_EXECUTED.inc_by(self.accumulator.len() as u64);
|
||||
info!(
|
||||
"Executed deletion batch {}..{}",
|
||||
self.accumulator
|
||||
.first()
|
||||
.expect("accumulator should be non-empty"),
|
||||
self.accumulator
|
||||
.last()
|
||||
.expect("accumulator should be non-empty"),
|
||||
);
|
||||
self.accumulator.clear();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("DeleteObjects request failed: {e:#}, will retry");
|
||||
DELETION_QUEUE_ERRORS.with_label_values(&["execute"]).inc();
|
||||
}
|
||||
};
|
||||
}
|
||||
if self.cancel.is_cancelled() {
|
||||
// Expose an error because we may not have actually flushed everything
|
||||
Err(DeletionQueueError::ShuttingDown)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn background(&mut self) -> Result<(), DeletionQueueError> {
|
||||
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
|
||||
let msg = match tokio::time::timeout(AUTOFLUSH_INTERVAL, self.rx.recv()).await {
|
||||
Ok(Some(m)) => m,
|
||||
Ok(None) => {
|
||||
// All queue senders closed
|
||||
info!("Shutting down");
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeout, we hit deadline to execute whatever we have in hand. These functions will
|
||||
// return immediately if no work is pending
|
||||
self.flush().await?;
|
||||
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match msg {
|
||||
ExecutorMessage::Delete(mut list) => {
|
||||
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
self.flush().await?;
|
||||
                        // flush() only returns Ok once the accumulator has been drained, so it must be empty here
|
||||
assert_eq!(self.accumulator.len(), 0);
|
||||
}
|
||||
|
||||
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
|
||||
let take_count = std::cmp::min(available_slots, list.len());
|
||||
for path in list.drain(list.len() - take_count..) {
|
||||
self.accumulator.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
ExecutorMessage::Flush(flush_op) => {
|
||||
// If flush() errors, we drop the flush_op and the caller will get
|
||||
// an error recv()'ing their oneshot channel.
|
||||
self.flush().await?;
|
||||
flush_op.fire();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
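// Editorial sketch (not part of this diff): what the chunking in the Delete arm above works out to.
// Assuming MAX_KEYS_PER_DELETE is 1000 (the S3 DeleteObjects limit), a single ExecutorMessage::Delete
// carrying 2500 keys triggers two immediate DeleteObjects calls of 1000 keys each; the remaining
// 500 keys sit in the accumulator until the next AUTOFLUSH_INTERVAL deadline or an explicit Flush.
fn chunking_example(total_keys: usize, max_per_delete: usize) -> (usize, usize) {
    // (full batches flushed immediately, keys left waiting in the accumulator)
    (total_keys / max_per_delete, total_keys % max_per_delete)
}
// chunking_example(2500, 1000) == (2, 500)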
|
||||
@@ -1,376 +0,0 @@
|
||||
use super::BackendQueueMessage;
|
||||
use super::DeletionHeader;
|
||||
use super::DeletionList;
|
||||
use super::FlushOp;
|
||||
|
||||
use std::fs::create_dir_all;
|
||||
use std::time::Duration;
|
||||
|
||||
use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::DELETION_QUEUE_ERRORS;
|
||||
use crate::metrics::DELETION_QUEUE_SUBMITTED;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to deliver objects of the order
|
||||
// of magnitude 1MB when we are under heavy delete load.
|
||||
const DELETION_LIST_TARGET_SIZE: usize = 16384;
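// Editorial sizing note (assumption: roughly 64-200 bytes per serialized key, depending on layer
// path length): a full list of 16384 keys serializes to about 1-3 MiB, which is where the
// "order of magnitude 1MB" target above comes from. At the low end, 16384 * 64 bytes = 1 MiB exactly.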
|
||||
|
||||
// Ordinarily, we only flush to DeletionList periodically, to bound the window during
|
||||
// which we might leak objects from not flushing a DeletionList after
|
||||
// the objects are already unlinked from timeline metadata.
|
||||
const FRONTEND_DEFAULT_TIMEOUT: Duration = Duration::from_millis(10000);
|
||||
|
||||
// If someone is waiting for a flush to DeletionList, only delay a little to accumulate
|
||||
// more objects before doing the flush.
|
||||
const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) struct DeletionOp {
|
||||
pub(super) tenant_id: TenantId,
|
||||
pub(super) timeline_id: TimelineId,
|
||||
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
|
||||
// have a config object handy to project it to a remote key, and need the consuming worker
|
||||
// to do it for you.
|
||||
pub(super) layers: Vec<(LayerFileName, Generation)>,
|
||||
pub(super) objects: Vec<RemotePath>,
|
||||
|
||||
/// The _current_ generation of the Tenant attachment in which we are enqueuing
|
||||
/// this deletion.
|
||||
pub(super) generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(super) enum FrontendQueueMessage {
|
||||
Delete(DeletionOp),
|
||||
// Wait until all prior deletions make it into a persistent DeletionList
|
||||
Flush(FlushOp),
|
||||
// Wait until all prior deletions have been executed (i.e. objects are actually deleted)
|
||||
FlushExecute(FlushOp),
|
||||
}
|
||||
|
||||
pub struct FrontendQueueWorker {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
// Incoming frontend requests to delete some keys
|
||||
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
|
||||
|
||||
// Outbound requests to the backend to execute deletion lists we have composed.
|
||||
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
|
||||
|
||||
// The list we are currently building, contains a buffer of keys to delete
|
||||
// and our next sequence number
|
||||
pending: DeletionList,
|
||||
|
||||
// These FlushOps should fire the next time we flush
|
||||
pending_flushes: Vec<FlushOp>,
|
||||
|
||||
// Worker loop is torn down when this fires.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl FrontendQueueWorker {
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<FrontendQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<BackendQueueMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
pending: DeletionList::new(1),
|
||||
conf,
|
||||
rx,
|
||||
tx,
|
||||
pending_flushes: Vec::new(),
|
||||
cancel,
|
||||
}
|
||||
}
|
||||
async fn upload_pending_list(&mut self) -> anyhow::Result<()> {
|
||||
let path = self.conf.deletion_list_path(self.pending.sequence);
|
||||
|
||||
let bytes = serde_json::to_vec(&self.pending).expect("Failed to serialize deletion list");
|
||||
tokio::fs::write(&path, &bytes).await?;
|
||||
tokio::fs::File::open(&path).await?.sync_all().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Try to flush `list` to persistent storage
|
||||
///
|
||||
/// This does not return errors, because on failure to flush we do not lose
|
||||
/// any state: flushing will be retried implicitly on the next deadline
|
||||
async fn flush(&mut self) {
|
||||
if self.pending.is_empty() {
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.fire();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
match self.upload_pending_list().await {
|
||||
Ok(_) => {
|
||||
info!(sequence = self.pending.sequence, "Stored deletion list");
|
||||
|
||||
for f in self.pending_flushes.drain(..) {
|
||||
f.fire();
|
||||
}
|
||||
|
||||
let onward_list = self.pending.drain();
|
||||
|
||||
// We have consumed out of pending: reset it for the next incoming deletions to accumulate there
|
||||
self.pending = DeletionList::new(self.pending.sequence + 1);
|
||||
|
||||
if let Err(e) = self.tx.send(BackendQueueMessage::Delete(onward_list)).await {
|
||||
// This is allowed to fail: it will only happen if the backend worker is shut down,
|
||||
// so we can just drop this on the floor.
|
||||
info!("Deletion list dropped, this is normal during shutdown ({e:#})");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
DELETION_QUEUE_ERRORS.with_label_values(&["put_list"]).inc();
|
||||
warn!(
|
||||
sequence = self.pending.sequence,
|
||||
"Failed to write deletion list to remote storage, will retry later ({e:#})"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn recover(&mut self) -> Result<(), anyhow::Error> {
|
||||
// Load header: this is not required to be present, e.g. when a pageserver first runs
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
|
||||
// Synchronous, but we only do it once per process lifetime so it's tolerable
|
||||
create_dir_all(&self.conf.deletion_prefix())?;
|
||||
|
||||
let header_bytes = match tokio::fs::read(&header_path).await {
|
||||
Ok(h) => Ok(Some(h)),
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
debug!(
|
||||
"Deletion header {0} not found, first start?",
|
||||
header_path.display()
|
||||
);
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}?;
|
||||
|
||||
if let Some(header_bytes) = header_bytes {
|
||||
if let Some(header) = match serde_json::from_slice::<DeletionHeader>(&header_bytes) {
|
||||
Ok(h) => Some(h),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to deserialize deletion header, ignoring {0}: {e:#}",
|
||||
header_path.display()
|
||||
);
|
||||
// This should never happen unless we make a mistake with our serialization.
|
||||
                    // Ignoring a deletion header is not consequential for correctness because all deletions
|
||||
// are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
|
||||
None
|
||||
}
|
||||
} {
|
||||
self.pending.sequence =
|
||||
std::cmp::max(self.pending.sequence, header.last_deleted_list_seq + 1);
|
||||
};
|
||||
};
|
||||
|
||||
let mut dir = match tokio::fs::read_dir(&self.conf.deletion_prefix()).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Failed to open deletion list directory {0}: {e:#}",
|
||||
header_path.display()
|
||||
);
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
|
||||
        let list_name_pattern = Regex::new(r"([a-zA-Z0-9]{16})-([a-zA-Z0-9]{2})\.list").unwrap();
|
||||
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
let file_name = dentry.file_name().to_owned();
|
||||
let basename = file_name.to_string_lossy();
|
||||
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
|
||||
m.get(1)
|
||||
.expect("Non optional group should be present")
|
||||
.as_str()
|
||||
} else {
|
||||
warn!("Unexpected key in deletion queue: {basename}");
|
||||
continue;
|
||||
};
|
||||
|
||||
let seq: u64 = match u64::from_str_radix(seq_part, 16) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
warn!("Malformed key '{basename}': {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
seqs.push(seq);
|
||||
}
|
||||
seqs.sort();
|
||||
|
||||
// Initialize the next sequence number in the frontend based on the maximum of the highest list we see,
|
||||
// and the last list that was deleted according to the header. Combined with writing out the header
|
||||
        // prior to deletions, this guarantees no re-use of sequence numbers.
|
||||
if let Some(max_list_seq) = seqs.last() {
|
||||
self.pending.sequence = std::cmp::max(self.pending.sequence, max_list_seq + 1);
|
||||
}
|
||||
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
|
||||
let deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
// Drop the list on the floor: any objects it referenced will be left behind
|
||||
// for scrubbing to clean up. This should never happen unless we have a serialization bug.
|
||||
warn!(sequence = s, "Failed to deserialize deletion list: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// We will drop out of recovery if this fails: it indicates that we are shutting down
|
||||
// or the backend has panicked
|
||||
DELETION_QUEUE_SUBMITTED.inc_by(deletion_list.len() as u64);
|
||||
self.tx
|
||||
.send(BackendQueueMessage::Delete(deletion_list))
|
||||
.await?;
|
||||
}
|
||||
|
||||
info!(next_sequence = self.pending.sequence, "Replay complete");
|
||||
|
||||
Ok(())
|
||||
}
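    // Editorial worked example of the recovery rule above (values made up): with
    // header.last_deleted_list_seq == 5 and lists 0000000000000006-01.list and
    // 0000000000000007-01.list still on disk, pending.sequence becomes max(5 + 1, 7 + 1) = 8,
    // so the next list written is 0000000000000008-01.list and no sequence number is ever
    // reused, even if some executed lists have already been removed.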
|
||||
|
||||
/// This is the front-end ingest, where we bundle up deletion requests into DeletionList
|
||||
    /// and write them out for later execution by the backend
|
||||
pub async fn background(&mut self) {
|
||||
info!("Started deletion frontend worker");
|
||||
|
||||
let mut recovered: bool = false;
|
||||
|
||||
while !self.cancel.is_cancelled() {
|
||||
let timeout = if self.pending_flushes.is_empty() {
|
||||
FRONTEND_DEFAULT_TIMEOUT
|
||||
} else {
|
||||
FRONTEND_FLUSHING_TIMEOUT
|
||||
};
|
||||
|
||||
let msg = match tokio::time::timeout(timeout, self.rx.recv()).await {
|
||||
Ok(Some(msg)) => msg,
|
||||
Ok(None) => {
|
||||
// Queue sender destroyed, shutting down
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Hit deadline, flush.
|
||||
self.flush().await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// On first message, do recovery. This avoids unnecessary recovery very
|
||||
// early in startup, and simplifies testing by avoiding a 404 reading the
|
||||
// header on every first pageserver startup.
|
||||
if !recovered {
|
||||
// Before accepting any input from this pageserver lifetime, recover all deletion lists that are in S3
|
||||
if let Err(e) = self.recover().await {
|
||||
// This should only happen in truly unrecoverable cases, like the recovery finding that the backend
|
||||
// queue receiver has been dropped.
|
||||
info!("Deletion queue recover aborted, deletion queue will not proceed ({e})");
|
||||
return;
|
||||
} else {
|
||||
recovered = true;
|
||||
}
|
||||
}
|
||||
|
||||
match msg {
|
||||
FrontendQueueMessage::Delete(op) => {
|
||||
debug!(
|
||||
"Delete: ingesting {0} layers, {1} other objects",
|
||||
op.layers.len(),
|
||||
op.objects.len()
|
||||
);
|
||||
|
||||
let mut layer_paths = Vec::new();
|
||||
for (layer, generation) in op.layers {
|
||||
layer_paths.push(remote_layer_path(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
&layer,
|
||||
generation,
|
||||
));
|
||||
}
|
||||
layer_paths.extend(op.objects);
|
||||
|
||||
                    if !self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
                    )
|
||||
{
|
||||
self.flush().await;
|
||||
let retry = self.pending.push(
|
||||
&op.tenant_id,
|
||||
&op.timeline_id,
|
||||
op.generation,
|
||||
&mut layer_paths,
|
||||
);
|
||||
                        if !retry {
|
||||
                            // Unexpected: after we flush, we should have
|
||||
// drained self.pending, so a conflict on
|
||||
// generation numbers should be impossible.
|
||||
tracing::error!(
|
||||
"Failed to enqueue deletions, leaking objects. This is a bug."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::Flush(op) => {
|
||||
if self.pending.is_empty() {
|
||||
// Execute immediately
|
||||
debug!("Flush: No pending objects, flushing immediately");
|
||||
op.fire()
|
||||
} else {
|
||||
// Execute next time we flush
|
||||
debug!("Flush: adding to pending flush list for next deadline flush");
|
||||
self.pending_flushes.push(op);
|
||||
}
|
||||
}
|
||||
FrontendQueueMessage::FlushExecute(op) => {
|
||||
debug!("FlushExecute: passing through to backend");
|
||||
// We do not flush to a deletion list here: the client sends a Flush before the FlushExecute
|
||||
if let Err(e) = self.tx.send(BackendQueueMessage::Flush(op)).await {
|
||||
info!("Can't flush, shutting down ({e})");
|
||||
// Caller will get error when their oneshot sender was dropped.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if self.pending.len() > DELETION_LIST_TARGET_SIZE || !self.pending_flushes.is_empty() {
|
||||
self.flush().await;
|
||||
}
|
||||
}
|
||||
info!("Deletion queue shut down.");
|
||||
}
|
||||
}
|
||||
@@ -52,29 +52,6 @@ paths:
|
||||
schema:
|
||||
type: object
|
||||
|
||||
/v1/deletion_queue/flush:
|
||||
parameters:
|
||||
- name: execute
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description:
|
||||
If true, attempt to execute deletions. If false, just flush deletions to persistent deletion lists.
|
||||
put:
|
||||
description: Execute any deletions currently enqueued
|
||||
security: []
|
||||
responses:
|
||||
"200":
|
||||
description: |
|
||||
Flush completed: if execute was true, then enqueued deletions have been completed. If execute was false,
|
||||
then enqueued deletions have been persisted to deletion lists, and may have been completed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
|
||||
@@ -8,7 +8,9 @@ use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest};
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
@@ -23,7 +25,6 @@ use super::models::{
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::{DeletionQueue, DeletionQueueError};
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
@@ -59,7 +60,6 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
}
|
||||
@@ -69,7 +69,6 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<Self> {
|
||||
@@ -83,7 +82,6 @@ impl State {
|
||||
allowlist_routes,
|
||||
remote_storage,
|
||||
broker_client,
|
||||
deletion_queue,
|
||||
disk_usage_eviction_state,
|
||||
})
|
||||
}
|
||||
@@ -489,20 +487,7 @@ async fn tenant_attach_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = if state.conf.control_plane_api.is_some() {
|
||||
// If we have been configured with a control plane URI, then generations are
|
||||
// mandatory, as we will attempt to re-attach on startup.
|
||||
maybe_body
|
||||
.as_ref()
|
||||
.map(|tar| tar.generation)
|
||||
.flatten()
|
||||
.map(|g| Generation::new(g))
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))?
|
||||
} else {
|
||||
Generation::none()
|
||||
};
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
mgr::attach_tenant(
|
||||
@@ -512,7 +497,6 @@ async fn tenant_attach_handler(
|
||||
tenant_conf,
|
||||
state.broker_client.clone(),
|
||||
remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
@@ -561,7 +545,7 @@ async fn tenant_detach_handler(
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(
|
||||
request: Request<Body>,
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
@@ -569,13 +553,20 @@ async fn tenant_load_handler(
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
|
||||
// should always use /attach instead.
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
mgr::load_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
@@ -875,6 +866,21 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Helper for requests that may take a generation, which is mandatory
|
||||
/// when control_plane_api is set, but otherwise defaults to Generation::none()
|
||||
fn get_request_generation(state: &State, req_gen: Option<u32>) -> Result<Generation, ApiError> {
|
||||
if state.conf.control_plane_api.is_some() {
|
||||
req_gen
|
||||
.map(Generation::new)
|
||||
.ok_or(ApiError::BadRequest(anyhow!(
|
||||
"generation attribute missing"
|
||||
)))
|
||||
} else {
|
||||
// Legacy mode: all tenants operate with no generation
|
||||
Ok(Generation::none())
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -891,16 +897,12 @@ async fn tenant_create_handler(
|
||||
let tenant_conf =
|
||||
TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
// TODO: make generation mandatory here once control plane supports it.
|
||||
let generation = request_data
|
||||
.generation
|
||||
.map(|g| Generation::new(g))
|
||||
.unwrap_or(Generation::none());
|
||||
let state = get_state(&request);
|
||||
|
||||
let generation = get_request_generation(state, request_data.generation)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let new_tenant = mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
@@ -908,7 +910,6 @@ async fn tenant_create_handler(
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
&state.deletion_queue,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
@@ -1149,48 +1150,6 @@ async fn always_panic_handler(
|
||||
json_response(StatusCode::NO_CONTENT, ())
|
||||
}
|
||||
|
||||
async fn deletion_queue_flush(
|
||||
r: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&r);
|
||||
|
||||
if state.remote_storage.is_none() {
|
||||
// Nothing to do if remote storage is disabled.
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
|
||||
|
||||
let queue_client = state.deletion_queue.new_client();
|
||||
|
||||
tokio::select! {
|
||||
flush_result = async {
|
||||
if execute {
|
||||
queue_client.flush_execute().await
|
||||
} else {
|
||||
queue_client.flush().await
|
||||
}
|
||||
} => {
|
||||
match flush_result {
|
||||
Ok(())=> {
|
||||
json_response(StatusCode::OK, ())
|
||||
},
|
||||
Err(e) => {
|
||||
match e {
|
||||
DeletionQueueError::ShuttingDown => {
|
||||
Err(ApiError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(ApiError::ShuttingDown)
|
||||
}
|
||||
}
|
||||
}
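// Editorial sketch (not part of this diff): exercising the flush endpoint above from a test or
// script. The base URL (assumed default pageserver management address) and the absence of auth
// are assumptions.
async fn flush_deletion_queue_example(base: &str, execute: bool) -> anyhow::Result<()> {
    // execute=true waits for enqueued deletions to actually run; execute=false only waits for
    // them to be persisted into deletion lists.
    let url = format!("{base}/v1/deletion_queue/flush?execute={execute}");
    let resp = reqwest::Client::new().put(url).send().await?;
    anyhow::ensure!(resp.status().is_success(), "flush failed: {}", resp.status());
    Ok(())
}
// e.g. flush_deletion_queue_example("http://127.0.0.1:9898", true).await?;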
|
||||
|
||||
async fn disk_usage_eviction_run(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1400,7 +1359,6 @@ pub fn make_router(
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
broker_client: BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: DeletionQueue,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
@@ -1430,7 +1388,6 @@ pub fn make_router(
|
||||
conf,
|
||||
auth,
|
||||
remote_storage,
|
||||
deletion_queue,
|
||||
broker_client,
|
||||
disk_usage_eviction_state,
|
||||
)
|
||||
@@ -1515,9 +1472,6 @@ pub fn make_router(
|
||||
.put("/v1/disk_usage_eviction/run", |r| {
|
||||
api_handler(r, disk_usage_eviction_run)
|
||||
})
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -3,7 +3,7 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
pub mod deletion_queue;
|
||||
mod control_plane_client;
|
||||
pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
|
||||
@@ -795,31 +795,6 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_SUBMITTED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_deletion_queue_submitted_total",
|
||||
"Number of objects submitted for deletion"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_EXECUTED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELETION_QUEUE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_deletion_queue_errors_total",
|
||||
"Incremented on retryable remote I/O errors writing deletion lists or executing deletions.",
|
||||
&["op_kind"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_timeline_client_bytes_started",
|
||||
|
||||
@@ -469,7 +469,9 @@ impl PageServerHandler {
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
|
||||
@@ -32,9 +32,7 @@ use std::fmt::Debug;
|
||||
use std::fmt::Display;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
@@ -59,7 +57,6 @@ use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::import_datadir;
|
||||
use crate::is_uninit_mark;
|
||||
use crate::metrics::TENANT_ACTIVATION;
|
||||
@@ -69,7 +66,7 @@ use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::metadata::load_metadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
@@ -116,12 +113,11 @@ pub mod block_io;
|
||||
pub mod disk_btree;
|
||||
pub(crate) mod ephemeral_file;
|
||||
pub mod layer_map;
|
||||
pub mod manifest;
|
||||
mod span;
|
||||
|
||||
pub mod metadata;
|
||||
mod par_fsync;
|
||||
pub mod remote_timeline_client;
|
||||
mod remote_timeline_client;
|
||||
pub mod storage_layer;
|
||||
|
||||
pub mod config;
|
||||
@@ -145,6 +141,9 @@ pub use crate::tenant::metadata::save_metadata;
|
||||
// re-export for use in walreceiver
|
||||
pub use crate::tenant::timeline::WalReceiverInfo;
|
||||
|
||||
/// The "tenants" part of `tenants/<tenant>/timelines...`
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
|
||||
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
|
||||
@@ -158,7 +157,6 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub remote_storage: Option<GenericRemoteStorage>,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -182,7 +180,8 @@ pub struct Tenant {
|
||||
|
||||
tenant_id: TenantId,
|
||||
|
||||
// The remote storage generation, used to protect S3 objects from split-brain
|
||||
/// The remote storage generation, used to protect S3 objects from split-brain.
|
||||
/// Does not change over the lifetime of the [`Tenant`] object.
|
||||
generation: Generation,
|
||||
|
||||
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
@@ -196,10 +195,7 @@ pub struct Tenant {
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
|
||||
// Access to global deletion queue for when this tenant wants to schedule a deletion
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
pub(crate) remote_storage: Option<GenericRemoteStorage>,
|
||||
|
||||
    /// Cached logical sizes, updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
@@ -411,7 +407,6 @@ impl Tenant {
|
||||
remote_startup_data: Option<RemoteStartupData>,
|
||||
local_metadata: Option<TimelineMetadata>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
first_save: bool,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -445,14 +440,9 @@ impl Tenant {
|
||||
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
up_to_date_metadata,
|
||||
first_save,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
|
||||
@@ -536,7 +526,6 @@ impl Tenant {
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO dedup with spawn_load
|
||||
@@ -552,7 +541,6 @@ impl Tenant {
|
||||
tenant_id,
|
||||
generation,
|
||||
Some(remote_storage.clone()),
|
||||
Some(deletion_queue_client),
|
||||
));
|
||||
|
||||
// Do all the hard work in the background
|
||||
@@ -738,7 +726,6 @@ impl Tenant {
|
||||
remote_metadata,
|
||||
TimelineResources {
|
||||
remote_client: Some(remote_client),
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
@@ -763,7 +750,6 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&index_part.metadata,
|
||||
Some(remote_timeline_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
@@ -841,7 +827,6 @@ impl Tenant {
|
||||
}),
|
||||
local_metadata,
|
||||
ancestor,
|
||||
true,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
@@ -866,7 +851,6 @@ impl Tenant {
|
||||
tenant_id,
|
||||
Generation::broken(),
|
||||
None,
|
||||
None,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -901,7 +885,6 @@ impl Tenant {
|
||||
|
||||
let broker_client = resources.broker_client;
|
||||
let remote_storage = resources.remote_storage;
|
||||
let deletion_queue_client = resources.deletion_queue_client;
|
||||
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Tenant::new(
|
||||
@@ -912,7 +895,6 @@ impl Tenant {
|
||||
tenant_id,
|
||||
generation,
|
||||
remote_storage.clone(),
|
||||
Some(deletion_queue_client),
|
||||
);
|
||||
let tenant = Arc::new(tenant);
|
||||
|
||||
@@ -1320,7 +1302,6 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
Some(remote_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -1370,7 +1351,6 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&local_metadata,
|
||||
None,
|
||||
None,
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
@@ -1399,7 +1379,6 @@ impl Tenant {
|
||||
remote_startup_data,
|
||||
Some(local_metadata),
|
||||
ancestor,
|
||||
false,
|
||||
init_order,
|
||||
ctx,
|
||||
)
|
||||
@@ -1463,7 +1442,7 @@ impl Tenant {
|
||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||
/// minimum amount of keys required to get a writable timeline.
|
||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||
pub fn create_empty_timeline(
|
||||
pub async fn create_empty_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
@@ -1475,10 +1454,10 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?;
|
||||
drop(timelines);
|
||||
|
||||
let timeline_uninit_mark = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
|
||||
};
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
// make it valid, before calling finish_creation()
|
||||
@@ -1497,6 +1476,7 @@ impl Tenant {
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Helper for unit tests to create an empty timeline.
|
||||
@@ -1512,7 +1492,9 @@ impl Tenant {
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let uninit_tl = self.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)?;
|
||||
let uninit_tl = self
|
||||
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
|
||||
.await?;
|
||||
let tline = uninit_tl.raw_timeline().expect("we just created it");
|
||||
assert_eq!(tline.get_last_record_lsn(), Lsn(0));
|
||||
|
||||
@@ -1530,6 +1512,15 @@ impl Tenant {
|
||||
tline.maybe_spawn_flush_loop();
|
||||
tline.freeze_and_flush().await.context("freeze_and_flush")?;
|
||||
|
||||
// Make sure the freeze_and_flush reaches remote storage.
|
||||
tline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.wait_completion()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let tl = uninit_tl.finish_creation()?;
|
||||
// The non-test code would call tl.activate() here.
|
||||
tl.set_state(TimelineState::Active);
|
||||
@@ -1706,65 +1697,6 @@ impl Tenant {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush all in-memory data to disk and remote storage, if any.
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
async fn freeze_and_flush_on_shutdown(&self) {
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
// execute on each timeline on the JoinSet, join after.
|
||||
let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
|
||||
async move {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
match timeline.freeze_and_flush().await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let res = if let Some(client) = timeline.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, it might be our shutdown process
|
||||
// didn't wait for remote uploads to complete at all, as new tasks can forever
|
||||
// be spawned.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.wait_completion().await
|
||||
} else {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
if let Err(e) = res {
|
||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
|
||||
};
|
||||
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
.iter()
|
||||
.map(|(id, tl)| (*id, Arc::clone(tl)))
|
||||
.for_each(|(timeline_id, timeline)| {
|
||||
js.spawn(per_timeline(timeline_id, timeline));
|
||||
})
|
||||
};
|
||||
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
|
||||
Err(je) if je.is_panic() => { /* logged already */ }
|
||||
Err(je) => warn!("unexpected JoinError: {je:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_state(&self) -> TenantState {
|
||||
self.state.borrow().clone()
|
||||
}
|
||||
@@ -1895,19 +1827,22 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
if freeze_and_flush {
|
||||
            // walreceiver has already begun to shut down with TenantState::Stopping, but we need to
|
||||
// await for them to stop.
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_id),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
// this will wait for uploads to complete; in the past, it was done outside tenant
|
||||
// shutdown in pageserver::shutdown_pageserver.
|
||||
self.freeze_and_flush_on_shutdown().await;
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines.values().for_each(|timeline| {
|
||||
let timeline = Arc::clone(timeline);
|
||||
let span = Span::current();
|
||||
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
|
||||
})
|
||||
};
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
|
||||
Err(je) if je.is_panic() => { /* logged already */ }
|
||||
Err(je) => warn!("unexpected JoinError: {je:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service
|
||||
@@ -2315,16 +2250,7 @@ impl Tenant {
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
) -> Tenant {
|
||||
#[cfg(not(test))]
|
||||
match state {
|
||||
TenantState::Broken { .. } => {}
|
||||
_ => {
|
||||
// Non-broken tenants must be constructed with a deletion queue
|
||||
assert!(deletion_queue_client.is_some());
|
||||
}
|
||||
}
|
||||
let (state, mut rx) = watch::channel(state);
|
||||
|
||||
tokio::spawn(async move {
|
||||
@@ -2391,7 +2317,6 @@ impl Tenant {
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
@@ -2444,72 +2369,37 @@ impl Tenant {
|
||||
Ok(tenant_conf)
|
||||
}
|
||||
|
||||
pub(super) fn persist_tenant_config(
|
||||
#[tracing::instrument(skip_all, fields(%tenant_id))]
|
||||
pub(super) async fn persist_tenant_config(
|
||||
tenant_id: &TenantId,
|
||||
target_config_path: &Path,
|
||||
tenant_conf: TenantConfOpt,
|
||||
creating_tenant: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving tenantconf").entered();
|
||||
|
||||
// imitate a try-block with a closure
|
||||
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
|
||||
let target_config_parent = target_config_path.parent().with_context(|| {
|
||||
format!(
|
||||
"Config path does not have a parent: {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})?;
|
||||
info!("persisting tenantconf to {}", target_config_path.display());
|
||||
|
||||
info!("persisting tenantconf to {}", target_config_path.display());
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
# It is read in case of pageserver restart.
|
||||
|
||||
[tenant_config]
|
||||
"#
|
||||
.to_string();
|
||||
.to_string();
|
||||
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
|
||||
|
||||
let mut target_config_file = VirtualFile::open_with_options(
|
||||
target_config_path,
|
||||
OpenOptions::new()
|
||||
.truncate(true) // This needed for overwriting with small config files
|
||||
.write(true)
|
||||
.create_new(creating_tenant)
|
||||
// when creating a new tenant, first_save will be true and `.create(true)` will be
|
||||
// ignored (per rust std docs).
|
||||
//
|
||||
// later when updating the config of created tenant, or persisting config for the
|
||||
// first time for attached tenant, the `.create(true)` is used.
|
||||
.create(true),
|
||||
)?;
|
||||
let conf_content = conf_content.as_bytes();
|
||||
|
||||
target_config_file
|
||||
.write(conf_content.as_bytes())
|
||||
.context("write toml bytes into file")
|
||||
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
|
||||
.context("write config file")?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable.
|
||||
// before this was done conditionally on creating_tenant, but these management actions are rare
|
||||
// enough to just fsync it always.
|
||||
|
||||
crashsafe::fsync(target_config_parent)?;
|
||||
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
|
||||
Ok(())
|
||||
};
|
||||
|
||||
// this function is called from creating the tenant and updating the tenant config, which
|
||||
// would otherwise share this context, so keep it here in one place.
|
||||
do_persist(target_config_path).with_context(|| {
|
||||
format!(
|
||||
"write tenant {tenant_id} config to {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})
|
||||
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"write tenant {tenant_id} config to {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
@@ -2820,13 +2710,15 @@ impl Tenant {
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
|
||||
let uninitialized_timeline = self.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)?;
|
||||
let uninitialized_timeline = self
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let new_timeline = uninitialized_timeline.finish_creation()?;
|
||||
|
||||
@@ -2904,13 +2796,15 @@ impl Tenant {
|
||||
pgdata_lsn,
|
||||
pg_version,
|
||||
);
|
||||
let raw_timeline = self.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)?;
|
||||
let raw_timeline = self
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let tenant_id = raw_timeline.owning_tenant.tenant_id;
|
||||
let unfinished_timeline = raw_timeline.raw_timeline()?;
|
||||
@@ -2972,10 +2866,7 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
}
|
||||
TimelineResources { remote_client }
|
||||
}
|
||||
|
||||
/// Creates intermediate timeline structure and its files.
|
||||
@@ -2984,7 +2875,7 @@ impl Tenant {
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
||||
/// uninit mark file.
|
||||
fn prepare_new_timeline(
|
||||
async fn prepare_new_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
@@ -3012,8 +2903,9 @@ impl Tenant {
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) =
|
||||
self.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
|
||||
if let Err(e) = self
|
||||
.create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
|
||||
.await
|
||||
{
|
||||
error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
@@ -3029,7 +2921,7 @@ impl Tenant {
|
||||
))
|
||||
}
|
||||
|
||||
fn create_timeline_files(
|
||||
async fn create_timeline_files(
|
||||
&self,
|
||||
timeline_path: &Path,
|
||||
new_timeline_id: &TimelineId,
|
||||
@@ -3041,14 +2933,9 @@ impl Tenant {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
});
|
||||
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_id,
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
true,
|
||||
)
|
||||
.context("Failed to create timeline metadata")?;
|
||||
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
|
||||
.await
|
||||
.context("Failed to create timeline metadata")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3195,7 +3082,7 @@ pub(crate) enum CreateTenantFilesMode {
|
||||
Attach,
|
||||
}
|
||||
|
||||
pub(crate) fn create_tenant_files(
|
||||
pub(crate) async fn create_tenant_files(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: &TenantId,
|
||||
@@ -3231,7 +3118,8 @@ pub(crate) fn create_tenant_files(
|
||||
mode,
|
||||
&temporary_tenant_dir,
|
||||
&target_tenant_directory,
|
||||
);
|
||||
)
|
||||
.await;
|
||||
|
||||
if creation_result.is_err() {
|
||||
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
|
||||
@@ -3249,7 +3137,7 @@ pub(crate) fn create_tenant_files(
|
||||
Ok(target_tenant_directory)
|
||||
}
|
||||
|
||||
fn try_create_target_tenant_dir(
|
||||
async fn try_create_target_tenant_dir(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: &TenantId,
|
||||
@@ -3288,7 +3176,7 @@ fn try_create_target_tenant_dir(
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
|
||||
|
||||
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
|
||||
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
|
||||
|
||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||
format!(
|
||||
@@ -3493,6 +3381,8 @@ pub mod harness {
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: TenantId,
|
||||
pub generation: Generation,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub remote_fs_dir: PathBuf,
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
@@ -3530,30 +3420,39 @@ pub mod harness {
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
let remote_fs_dir = conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&remote_fs_dir).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
remote_storage,
|
||||
remote_fs_dir,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
(
|
||||
self.try_load(&ctx, None, None)
|
||||
self.try_load(&ctx)
|
||||
.await
|
||||
.expect("failed to load test tenant"),
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn try_load(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
remote_storage: Option<remote_storage::GenericRemoteStorage>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
@@ -3563,8 +3462,7 @@ pub mod harness {
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
self.generation,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
Some(self.remote_storage.clone()),
|
||||
));
|
||||
tenant
|
||||
.load(None, ctx)
|
||||
@@ -3677,7 +3575,10 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) {
|
||||
match tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||
Err(e) => assert_eq!(
|
||||
e.to_string(),
|
||||
@@ -4032,6 +3933,13 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let (tenant, _ctx) = harness.load().await;
|
||||
@@ -4065,6 +3973,14 @@ mod tests {
|
||||
.expect("Should have a local timeline");
|
||||
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
|
||||
|
||||
// so that all uploads finish & we can call harness.load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
// check that both of them are initially unloaded
|
||||
@@ -4117,6 +4033,13 @@ mod tests {
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
drop(tline);
|
||||
// so that all uploads finish & we can call harness.try_load() below again
|
||||
tenant
|
||||
.shutdown(Default::default(), true)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
|
||||
.await
|
||||
.ok()
|
||||
.unwrap();
|
||||
drop(tenant);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
@@ -4128,11 +4051,7 @@ mod tests {
|
||||
metadata_bytes[8] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let err = harness
|
||||
.try_load(&ctx, None, None)
|
||||
.await
|
||||
.err()
|
||||
.expect("should fail");
|
||||
let err = harness.try_load(&ctx).await.err().expect("should fail");
|
||||
// get all the stack with all .context, not only the last one
|
||||
let message = format!("{err:#}");
|
||||
let expected = "failed to load metadata";
|
||||
@@ -4517,8 +4436,9 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let initdb_lsn = Lsn(0x20);
|
||||
let utline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let utline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = utline.raw_timeline().unwrap();
|
||||
|
||||
// Spawn flush loop now so that we can set the `expect_initdb_optimization`
|
||||
@@ -4583,9 +4503,15 @@ mod tests {
|
||||
let harness = TenantHarness::create(name)?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Keeps uninit mark in place
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown(false)
|
||||
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
|
||||
.await;
|
||||
std::mem::forget(tline);
|
||||
}
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
//!
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
@@ -83,35 +84,24 @@ impl<'a> BlockCursor<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper of `VirtualFile` that allows users to write blobs.
|
||||
///
|
||||
/// Abstract trait for a data sink that you can write blobs to.
|
||||
///
|
||||
pub trait BlobWriter {
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
|
||||
}
|
||||
|
||||
///
|
||||
/// An implementation of BlobWriter to write blobs to anything that
|
||||
/// implements std::io::Write.
|
||||
///
|
||||
pub struct WriteBlobWriter<W>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
inner: W,
|
||||
/// If a `BlobWriter` is dropped, the internal buffer will be
|
||||
/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
|
||||
/// manually before dropping.
|
||||
pub struct BlobWriter<const BUFFERED: bool> {
|
||||
inner: VirtualFile,
|
||||
offset: u64,
|
||||
/// A buffer to save on write calls, only used if BUFFERED=true
|
||||
buf: Vec<u8>,
|
||||
}
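// Illustrative usage sketch only (not part of the change above): it just demonstrates the
// flush-before-drop requirement documented on the struct. The function name is hypothetical,
// and a tokio runtime plus an existing `path` are assumed.
async fn write_two_blobs_example(path: &std::path::Path) -> Result<(u64, u64), std::io::Error> {
    // Create the backing file and wrap it in a buffered writer starting at offset 0.
    let file = VirtualFile::create(path).await?;
    let mut writer = BlobWriter::<true>::new(file, 0);
    let first = writer.write_blob(b"first blob").await?;
    let second = writer.write_blob(b"second blob").await?;
    // Flush explicitly before the writer goes out of scope; alternatively,
    // `writer.into_inner().await?` flushes and returns the underlying VirtualFile.
    writer.flush_buffer().await?;
    Ok((first, second))
}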
|
||||
|
||||
impl<W> WriteBlobWriter<W>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
pub fn new(inner: W, start_offset: u64) -> Self {
|
||||
WriteBlobWriter {
|
||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
offset: start_offset,
|
||||
buf: Vec::with_capacity(Self::CAPACITY),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,28 +109,79 @@ where
|
||||
self.offset
|
||||
}
|
||||
|
||||
/// Access the underlying Write object.
|
||||
///
|
||||
/// NOTE: WriteBlobWriter keeps track of the current write offset. If
|
||||
/// you write something directly to the inner Write object, it makes the
|
||||
/// internally tracked 'offset' to go out of sync. So don't do that.
|
||||
pub fn into_inner(self) -> W {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
||||
|
||||
impl<W> BlobWriter for WriteBlobWriter<W>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
#[inline(always)]
|
||||
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
/// data will be written in the wrong order.
|
||||
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
|
||||
self.inner.write_all(src_buf).await?;
|
||||
self.offset += src_buf.len() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||
self.inner.write_all(&self.buf).await?;
|
||||
self.buf.clear();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
/// Writes as much of `src_buf` into the internal buffer as it fits
|
||||
fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
let to_copy = src_buf.len().min(remaining);
|
||||
self.buf.extend_from_slice(&src_buf[..to_copy]);
|
||||
self.offset += to_copy as u64;
|
||||
to_copy
|
||||
}
|
||||
|
||||
/// Internal, possibly buffered, write function
|
||||
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
|
||||
if !BUFFERED {
|
||||
assert!(self.buf.is_empty());
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
return Ok(());
|
||||
}
|
||||
let remaining = Self::CAPACITY - self.buf.len();
|
||||
// First try to copy as much as we can into the buffer
|
||||
if remaining > 0 {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
src_buf = &src_buf[copied..];
|
||||
}
|
||||
// Then, if the buffer is full, flush it out
|
||||
if self.buf.len() == Self::CAPACITY {
|
||||
self.flush_buffer().await?;
|
||||
}
|
||||
// Finally, write the tail of src_buf:
|
||||
// If it wholly fits into the buffer without
|
||||
// completely filling it, then put it there.
|
||||
// If not, write it out directly.
|
||||
if !src_buf.is_empty() {
|
||||
assert_eq!(self.buf.len(), 0);
|
||||
if src_buf.len() < Self::CAPACITY {
|
||||
let copied = self.write_into_buffer(src_buf);
|
||||
// We just verified above that src_buf fits into our internal buffer.
|
||||
assert_eq!(copied, src_buf.len());
|
||||
} else {
|
||||
self.write_all_unbuffered(src_buf).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write a blob of data. Returns the offset that it was written to,
|
||||
/// which can be used to retrieve the data later.
|
||||
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.inner.write_all(&[len_buf])?;
|
||||
self.offset += 1;
|
||||
self.write_all(&[len_buf]).await?;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
@@ -151,11 +192,153 @@ where
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.inner.write_all(&len_buf)?;
|
||||
self.offset += 4;
|
||||
self.write_all(&len_buf).await?;
|
||||
}
|
||||
self.inner.write_all(srcbuf)?;
|
||||
self.offset += srcbuf.len() as u64;
|
||||
self.write_all(srcbuf).await?;
|
||||
Ok(offset)
|
||||
}
|
||||
}
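// Illustrative sketch only (not part of the change above): the real read path lives in
// `BlockCursor::read_blob`. This hypothetical helper just shows how the 1-byte / 4-byte
// length header written by `write_blob` can be decoded from a contiguous byte slice that
// is assumed to contain at least the full header. Returns (blob_length, header_length).
fn decode_blob_len_header(buf: &[u8]) -> (usize, usize) {
    if buf[0] & 0x80 == 0 {
        // Short blob: single length byte, value < 128.
        (buf[0] as usize, 1)
    } else {
        // Long blob: 4-byte big-endian length with the top bit used as a marker.
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&buf[..4]);
        len_buf[0] &= 0x7f;
        (u32::from_be_bytes(len_buf) as usize, 4)
    }
}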
|
||||
|
||||
impl BlobWriter<true> {
|
||||
/// Access the underlying `VirtualFile`.
|
||||
///
|
||||
/// This function flushes the internal buffer before giving access
|
||||
/// to the underlying `VirtualFile`.
|
||||
pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
|
||||
self.flush_buffer().await?;
|
||||
Ok(self.inner)
|
||||
}
|
||||
|
||||
/// Access the underlying `VirtualFile`.
|
||||
///
|
||||
/// Unlike [`into_inner`](Self::into_inner), this doesn't flush
|
||||
/// the internal buffer before giving access.
|
||||
pub fn into_inner_no_flush(self) -> VirtualFile {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
impl BlobWriter<false> {
|
||||
/// Access the underlying `VirtualFile`.
|
||||
pub fn into_inner(self) -> VirtualFile {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::BlockReaderRef;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
|
||||
let temp_dir = tempfile::tempdir()?;
|
||||
let path = temp_dir.path().join("file");
|
||||
|
||||
// Write part (in block to drop the file)
|
||||
let mut offsets = Vec::new();
|
||||
{
|
||||
let file = VirtualFile::create(&path).await?;
|
||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||
for blob in blobs.iter() {
|
||||
let offs = wtr.write_blob(blob).await?;
|
||||
offsets.push(offs);
|
||||
}
|
||||
// Write out one page worth of zeros so that we can
|
||||
// read again with read_blk
|
||||
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
|
||||
println!("Writing final blob at offs={offs}");
|
||||
wtr.flush_buffer().await?;
|
||||
}
|
||||
|
||||
let file = VirtualFile::open(&path).await?;
|
||||
let rdr = BlockReaderRef::VirtualFile(&file);
|
||||
let rdr = BlockCursor::new(rdr);
|
||||
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
|
||||
let blob_read = rdr.read_blob(*offset).await?;
|
||||
assert_eq!(
|
||||
blob, &blob_read,
|
||||
"mismatch for idx={idx} at offset={offset}"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn random_array(len: usize) -> Vec<u8> {
|
||||
let mut rng = rand::thread_rng();
|
||||
(0..len).map(|_| rng.gen()).collect::<_>()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_one() -> Result<(), Error> {
|
||||
let blobs = &[vec![12, 21, 22]];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_hello_simple() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
vec![0, 1, 2, 3],
|
||||
b"Hello, World!".to_vec(),
|
||||
Vec::new(),
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_really_big_array() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
b"test".to_vec(),
|
||||
random_array(10 * PAGE_SZ),
|
||||
b"foobar".to_vec(),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_inc() -> Result<(), Error> {
|
||||
let blobs = (0..PAGE_SZ / 8)
|
||||
.map(|v| random_array(v * 16))
|
||||
.collect::<Vec<_>>();
|
||||
round_trip_test::<false>(&blobs).await?;
|
||||
round_trip_test::<true>(&blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_random_size() -> Result<(), Error> {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
let blobs = (0..1024)
|
||||
.map(|_| {
|
||||
let mut sz: u16 = rng.gen();
|
||||
// Make 50% of the arrays small
|
||||
if rng.gen() {
|
||||
sz |= 63;
|
||||
}
|
||||
random_array(sz.into())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
round_trip_test::<false>(&blobs).await?;
|
||||
round_trip_test::<true>(&blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_arrays_page_boundary() -> Result<(), Error> {
|
||||
let blobs = &[
|
||||
random_array(PAGE_SZ - 4),
|
||||
random_array(PAGE_SZ - 4),
|
||||
random_array(PAGE_SZ - 4),
|
||||
];
|
||||
round_trip_test::<false>(blobs).await?;
|
||||
round_trip_test::<true>(blobs).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,7 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use bytes::Bytes;
|
||||
use std::fs::File;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
|
||||
/// blocks, using the page cache
|
||||
@@ -73,12 +71,13 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
///
|
||||
/// Unlike traits, we also support the read function to be async though.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
|
||||
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
|
||||
FileBlockReaderVirtual(&'a FileBlockReader),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
#[cfg(test)]
|
||||
VirtualFile(&'a VirtualFile),
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
@@ -87,11 +86,12 @@ impl<'a> BlockReaderRef<'a> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
|
||||
FileBlockReaderFile(r) => r.read_blk(blknum).await,
|
||||
EphemeralFile(r) => r.read_blk(blknum).await,
|
||||
Adapter(r) => r.read_blk(blknum).await,
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
VirtualFile(r) => r.read_blk(blknum).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -105,7 +105,7 @@ impl<'a> BlockReaderRef<'a> {
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
|
||||
/// # let reader: FileBlockReader = unimplemented!("stub");
|
||||
/// let cursor = reader.block_cursor();
|
||||
/// let buf = cursor.read_blk(1);
|
||||
/// // do stuff with 'buf'
|
||||
@@ -122,7 +122,7 @@ impl<'a> BlockCursor<'a> {
|
||||
BlockCursor { reader }
|
||||
}
|
||||
// Needed by cli
|
||||
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
|
||||
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
|
||||
BlockCursor {
|
||||
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
|
||||
}
|
||||
@@ -143,27 +143,26 @@ impl<'a> BlockCursor<'a> {
|
||||
///
|
||||
/// The file is assumed to be immutable. This doesn't provide any functions
|
||||
/// for modifying the file, nor for invalidating the cache if it is modified.
|
||||
pub struct FileBlockReader<F> {
|
||||
pub file: F,
|
||||
pub struct FileBlockReader {
|
||||
pub file: VirtualFile,
|
||||
|
||||
/// Unique ID of this file, used as key in the page cache.
|
||||
file_id: page_cache::FileId,
|
||||
}
|
||||
|
||||
impl<F> FileBlockReader<F>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
pub fn new(file: F) -> Self {
|
||||
impl FileBlockReader {
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
let file_id = page_cache::next_file_id();
|
||||
|
||||
FileBlockReader { file_id, file }
|
||||
}
|
||||
|
||||
/// Read a page from the underlying file into given buffer.
|
||||
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
|
||||
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
self.file
|
||||
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
|
||||
.await
|
||||
}
|
||||
/// Read a block.
|
||||
///
|
||||
@@ -185,7 +184,7 @@ where
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
@@ -196,13 +195,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<File> {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for FileBlockReader<VirtualFile> {
|
||||
impl BlockReader for FileBlockReader {
|
||||
fn block_cursor(&self) -> BlockCursor<'_> {
|
||||
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use std::cmp::min;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use tracing::*;
|
||||
@@ -29,7 +28,7 @@ pub struct EphemeralFile {
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub fn create(
|
||||
pub async fn create(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -45,7 +44,8 @@ impl EphemeralFile {
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(EphemeralFile {
|
||||
page_cache_file_id: page_cache::next_file_id(),
|
||||
@@ -88,7 +88,8 @@ impl EphemeralFile {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
self.file
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
|
||||
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
@@ -128,10 +129,15 @@ impl EphemeralFile {
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
match self.ephemeral_file.file.write_all_at(
|
||||
&self.ephemeral_file.mutable_tail,
|
||||
self.blknum as u64 * PAGE_SZ as u64,
|
||||
) {
|
||||
match self
|
||||
.ephemeral_file
|
||||
.file
|
||||
.write_all_at(
|
||||
&self.ephemeral_file.mutable_tail,
|
||||
self.blknum as u64 * PAGE_SZ as u64,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
@@ -281,7 +287,7 @@ mod tests {
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo").await?;
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,325 +0,0 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations following that snapshot. The file begins with a
//! header recording the MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//!    startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//!    like redundant image layers and preparing us for future compaction
//!    strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from a crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::crc32c;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::log::warn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
pub struct Manifest {
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct Snapshot {
|
||||
pub layers: Vec<PersistentLayerDesc>,
|
||||
}
|
||||
|
||||
/// serde by default encodes this as a tagged enum, and therefore it will be something
|
||||
/// like `{ "AddLayer": { ... } }`.
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Record {
|
||||
AddLayer(PersistentLayerDesc),
|
||||
RemoveLayer(PersistentLayerDesc),
|
||||
}
|
||||
|
||||
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
|
||||
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
|
||||
const MANIFEST_VERSION: u64 = 1;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct ManifestHeader {
|
||||
magic_number: u64,
|
||||
version: u64,
|
||||
}
|
||||
|
||||
const MANIFEST_HEADER_LEN: usize = 16;
|
||||
|
||||
impl ManifestHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
|
||||
buf.put_u64(self.magic_number);
|
||||
buf.put_u64(self.version);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
magic_number: buf.get_u64(),
|
||||
version: buf.get_u64(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Operation {
|
||||
/// A snapshot of the current state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
|
||||
Snapshot(Snapshot, Lsn),
|
||||
/// An atomic operation that changes the state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
|
||||
/// This will only change when new L0 is flushed to the disk.
|
||||
Operation(Vec<Record>, Lsn),
|
||||
}
|
||||
|
||||
struct RecordHeader {
|
||||
size: u32,
|
||||
checksum: u32,
|
||||
}
|
||||
|
||||
const RECORD_HEADER_LEN: usize = 8;
|
||||
|
||||
impl RecordHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
|
||||
buf.put_u32(self.size);
|
||||
buf.put_u32(self.checksum);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
size: buf.get_u32(),
|
||||
checksum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
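// Illustrative sketch only (not part of the original module): it condenses the framing
// described above, an 8-byte record header (payload size plus crc32c of the payload)
// followed by a JSON payload, into two hypothetical helpers. The real read path with
// full error handling is `Manifest::load` below.
fn frame_record(payload: &[u8]) -> Vec<u8> {
    let header = RecordHeader {
        size: payload.len() as u32,
        checksum: crc32c(payload),
    };
    let mut framed = Vec::with_capacity(RECORD_HEADER_LEN + payload.len());
    framed.extend_from_slice(&header.encode());
    framed.extend_from_slice(payload);
    framed
}

// Returns the payload if the frame is complete and its checksum matches.
fn check_frame(buf: &[u8]) -> Option<&[u8]> {
    if buf.len() < RECORD_HEADER_LEN {
        return None;
    }
    let header = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
    let payload = buf.get(RECORD_HEADER_LEN..RECORD_HEADER_LEN + header.size as usize)?;
    (crc32c(payload) == header.checksum).then_some(payload)
}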
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ManifestLoadError {
|
||||
#[error("manifest header is corrupted")]
|
||||
CorruptedManifestHeader,
|
||||
#[error("unsupported manifest version: got {0}, expected {1}")]
|
||||
UnsupportedVersion(u64, u64),
|
||||
#[error("error when decoding record: {0}")]
|
||||
DecodeRecord(serde_json::Error),
|
||||
#[error("I/O error: {0}")]
|
||||
Io(io::Error),
|
||||
}
|
||||
|
||||
#[must_use = "Should check if the manifest is partially corrupted"]
|
||||
pub struct ManifestPartiallyCorrupted(bool);
|
||||
|
||||
impl Manifest {
|
||||
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
|
||||
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
|
||||
let mut manifest = Self { file };
|
||||
manifest.append_manifest_header(ManifestHeader {
|
||||
magic_number: MANIFEST_MAGIC_NUMBER,
|
||||
version: MANIFEST_VERSION,
|
||||
})?;
|
||||
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
|
||||
/// the bool flag will be set to true and the user is responsible for reconstructing a new manifest
/// and backing up the current one.
|
||||
pub fn load(
|
||||
mut file: VirtualFile,
|
||||
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
|
||||
let mut buf = vec![];
|
||||
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
|
||||
|
||||
// Read manifest header
|
||||
let mut buf = Bytes::from(buf);
|
||||
if buf.remaining() < MANIFEST_HEADER_LEN {
|
||||
return Err(ManifestLoadError::CorruptedManifestHeader);
|
||||
}
|
||||
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
|
||||
buf.advance(MANIFEST_HEADER_LEN);
|
||||
if header.version != MANIFEST_VERSION {
|
||||
return Err(ManifestLoadError::UnsupportedVersion(
|
||||
header.version,
|
||||
MANIFEST_VERSION,
|
||||
));
|
||||
}
|
||||
|
||||
// Read operations
|
||||
let mut operations = Vec::new();
|
||||
let corrupted = loop {
|
||||
if buf.remaining() == 0 {
|
||||
break false;
|
||||
}
|
||||
if buf.remaining() < RECORD_HEADER_LEN {
|
||||
warn!("incomplete header when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
|
||||
let size = size as usize;
|
||||
buf.advance(RECORD_HEADER_LEN);
|
||||
if buf.remaining() < size {
|
||||
warn!("incomplete data when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let data = &buf[..size];
|
||||
if crc32c(data) != checksum {
|
||||
warn!("checksum mismatch when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
// if the following decode fails, we cannot use the manifest or safely ignore any record.
|
||||
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
|
||||
buf.advance(size);
|
||||
};
|
||||
Ok((
|
||||
Self { file },
|
||||
operations,
|
||||
ManifestPartiallyCorrupted(corrupted),
|
||||
))
|
||||
}
|
||||
|
||||
fn append_data(&mut self, data: &[u8]) -> Result<()> {
|
||||
if data.len() >= u32::MAX as usize {
|
||||
panic!("data too large");
|
||||
}
|
||||
let header = RecordHeader {
|
||||
size: data.len() as u32,
|
||||
checksum: crc32c(data),
|
||||
};
|
||||
let header = header.encode();
|
||||
self.file.write_all(&header)?;
|
||||
self.file.write_all(data)?;
|
||||
self.file.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
|
||||
let encoded = header.encode();
|
||||
self.file.write_all(&encoded)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add an operation to the manifest. The operation will be appended to the end of the file,
|
||||
/// and the file will be fsynced.
|
||||
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
|
||||
let encoded = Vec::from(serde_json::to_string(&operation)?);
|
||||
self.append_data(&encoded)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_read_manifest() {
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
|
||||
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
|
||||
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
|
||||
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
|
||||
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
|
||||
|
||||
// Write a manifest with a snapshot and some operations
|
||||
let snapshot = Snapshot {
|
||||
layers: vec![layer1, layer2],
|
||||
};
|
||||
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![Record::AddLayer(layer3.clone())],
|
||||
Lsn::from(1),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the second time and write
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 2);
|
||||
assert_eq!(
|
||||
&operations[0],
|
||||
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![
|
||||
Record::RemoveLayer(layer3.clone()),
|
||||
Record::AddLayer(layer4.clone()),
|
||||
],
|
||||
Lsn::from(2),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the third time and verify
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 3);
|
||||
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[2],
|
||||
&Operation::Operation(
|
||||
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
|
||||
Lsn::from(2)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -8,14 +8,13 @@
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
use std::io::{self};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use anyhow::{ensure, Context};
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -24,6 +23,7 @@ use utils::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
/// Use special format number to enable backward compatibility.
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
@@ -230,6 +230,23 @@ impl TimelineMetadata {
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
self.body.pg_version
|
||||
}
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
None,
|
||||
None,
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
0,
|
||||
);
|
||||
let bytes = instance.to_bytes().unwrap();
|
||||
Self::from_bytes(&bytes).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
@@ -255,38 +272,19 @@ impl Serialize for TimelineMetadata {
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
|
||||
pub async fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = conf.metadata_path(tenant_id, timeline_id);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
OpenOptions::new().write(true).create_new(first_save),
|
||||
)
|
||||
.context("open_with_options")?;
|
||||
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
}
|
||||
file.sync_all()?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(
|
||||
path.parent()
|
||||
.expect("Metadata should always have a parent dir"),
|
||||
)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
Ok(())
|
||||
}
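// Illustrative sketch only: `VirtualFile::crashsafe_overwrite` is used above but not shown
// in this diff. The general write-temp-then-rename pattern it is assumed to follow looks
// roughly like this synchronous, std-only version (helper name hypothetical):
fn crashsafe_overwrite_sketch(
    final_path: &std::path::Path,
    tmp_path: &std::path::Path,
    content: &[u8],
) -> std::io::Result<()> {
    use std::io::Write;
    // 1. Write the full content to a temporary file and fsync it.
    let mut tmp = std::fs::File::create(tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?;
    // 2. Atomically replace the final file with the temporary one.
    std::fs::rename(tmp_path, final_path)?;
    // 3. fsync the parent directory so the rename itself is durable.
    let parent = final_path.parent().expect("final path must have a parent");
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}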
|
||||
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::control_api::{HexTenantId, ReAttachRequest, ReAttachResponse};
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::ffi::OsStr;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
@@ -21,13 +19,14 @@ use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueue;
|
||||
use crate::control_plane_client::ControlPlaneClient;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -64,6 +63,29 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// This is "safe" in that that it won't leave behind a partially deleted directory
|
||||
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
|
||||
/// the contents.
|
||||
///
|
||||
/// This is pageserver-specific, as it relies on future processes after a crash to check
|
||||
/// for TEMP_FILE_SUFFIX when loading things.
|
||||
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
|
||||
let parent = path
|
||||
.as_ref()
|
||||
.parent()
|
||||
// It is invalid to call this function with a relative path. Tenant directories
|
||||
// should always have a parent.
|
||||
.ok_or(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
"Path must be absolute",
|
||||
))?;
|
||||
|
||||
let tmp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
fs::rename(&path, &tmp_path).await?;
|
||||
fs::File::open(parent).await?.sync_all().await?;
|
||||
fs::remove_dir_all(tmp_path).await
|
||||
}
|
||||
|
||||
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
@@ -74,82 +96,19 @@ pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
resources: TenantSharedResources,
|
||||
init_order: InitializationOrder,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
let mut tenants = HashMap::new();
|
||||
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what to attach
|
||||
let tenant_generations = conf
|
||||
.control_plane_api
|
||||
.as_ref()
|
||||
.map(|control_plane_api| async {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client");
|
||||
|
||||
// FIXME: it's awkward that join() requires the base to have a trailing slash, makes
|
||||
// it easy to get a config wrong
|
||||
assert!(
|
||||
control_plane_api.as_str().ends_with("/"),
|
||||
"control plane API needs trailing slash"
|
||||
);
|
||||
|
||||
let re_attach_path = control_plane_api
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
let request = ReAttachRequest { node_id: conf.id };
|
||||
|
||||
// TODO: we should have been passed a cancellation token, and use it to end
|
||||
// this loop gracefully
|
||||
loop {
|
||||
let response = match client
|
||||
.post(re_attach_path.clone())
|
||||
.json(&request)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Err(e) => Err(anyhow::Error::from(e)),
|
||||
Ok(r) => {
|
||||
if r.status() == StatusCode::OK {
|
||||
r.json::<ReAttachResponse>()
|
||||
.await
|
||||
.map_err(|e| anyhow::Error::from(e))
|
||||
} else {
|
||||
Err(anyhow::anyhow!("Unexpected status {}", r.status()))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
Ok(res) => {
|
||||
tracing::info!(
|
||||
"Received re-attach response with {0} tenants",
|
||||
res.tenants.len()
|
||||
);
|
||||
|
||||
// TODO: do something with it
|
||||
break res
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, t.generation))
|
||||
.collect::<HashMap<_, _>>();
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Error re-attaching tenants, retrying: {e:#}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let tenant_generations = match tenant_generations {
|
||||
Some(g) => Some(g.await),
|
||||
None => {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
None
|
||||
}
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
|
||||
Some(client.re_attach().await?)
|
||||
} else {
|
||||
info!("Control plane API not configured, tenant generations are disabled");
|
||||
None
|
||||
};
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
@@ -168,6 +127,8 @@ pub async fn init_tenant_mgr(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
// No need to use safe_remove_tenant_dir_all because this is already
|
||||
// a temporary path
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
@@ -208,7 +169,7 @@ pub async fn init_tenant_mgr(
|
||||
Ok(id) => id,
|
||||
Err(_) => {
|
||||
warn!(
|
||||
"Invalid tenant path (garbage in our repo directory?): {0}",
|
||||
"Invalid tenant path (garbage in our repo directory?): {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
continue;
|
||||
@@ -218,11 +179,11 @@ pub async fn init_tenant_mgr(
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&HexTenantId::new(tenant_id)) {
|
||||
Generation::new(*gen)
|
||||
if let Some(gen) = generations.get(&tenant_id) {
|
||||
*gen
|
||||
} else {
|
||||
info!("Detaching tenant {0}, control plane omitted it in re-attach response", tenant_id);
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove detached tenant directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
@@ -235,7 +196,7 @@ pub async fn init_tenant_mgr(
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(
|
||||
"Starting tenant {0} in legacy mode, no generation",
|
||||
"Starting tenant {} in legacy mode, no generation",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
Generation::none()
|
||||
@@ -279,6 +240,7 @@ pub async fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -320,7 +282,6 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
resources.broker_client,
|
||||
tenants,
|
||||
remote_storage,
|
||||
resources.deletion_queue_client,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
@@ -468,21 +429,19 @@ pub async fn create_tenant(
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
// We're holding the tenants lock in write mode while doing local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
|
||||
// and do the work in that state.
|
||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
|
||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let tenant_resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
|
||||
@@ -516,7 +475,8 @@ pub async fn set_new_tenant_config(
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
|
||||
let tenant_config_path = conf.tenant_config_path(&tenant_id);
|
||||
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
|
||||
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
tenant.set_new_tenant_config(new_tenant_conf);
|
||||
Ok(())
|
||||
@@ -602,7 +562,7 @@ async fn detach_tenant0(
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
safe_remove_tenant_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} removal")
|
||||
@@ -633,12 +593,12 @@ async fn detach_tenant0(
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
@@ -649,11 +609,8 @@ pub async fn load_tenant(
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
// TODO: remove the `/load` API once generation support is complete:
|
||||
// it becomes equivalent to attaching.
|
||||
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, Generation::none(), resources, None, &TENANTS, ctx)
|
||||
let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, generation, resources, None, &TENANTS, ctx)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
@@ -721,11 +678,10 @@ pub async fn attach_tenant(
|
||||
tenant_conf: TenantConfOpt,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue: &DeletionQueue,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
tenant_map_insert(tenant_id, || {
|
||||
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
|
||||
tenant_map_insert(tenant_id, || async {
|
||||
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
@@ -739,7 +695,6 @@ pub async fn attach_tenant(
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage: Some(remote_storage),
|
||||
deletion_queue_client: deletion_queue.new_client(),
|
||||
};
|
||||
let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, generation, resources, None, &TENANTS, ctx)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
@@ -774,12 +729,13 @@ pub enum TenantMapInsertError {
|
||||
///
|
||||
/// NB: the closure should return quickly because the current implementation of the tenants map
|
||||
/// serializes access through an `RwLock`.
|
||||
async fn tenant_map_insert<F>(
|
||||
async fn tenant_map_insert<F, R>(
|
||||
tenant_id: TenantId,
|
||||
insert_fn: F,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError>
|
||||
where
|
||||
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
|
||||
F: FnOnce() -> R,
|
||||
R: std::future::Future<Output = anyhow::Result<Arc<Tenant>>>,
|
||||
{
|
||||
let mut guard = TENANTS.write().await;
|
||||
let m = match &mut *guard {
|
||||
@@ -792,7 +748,7 @@ where
|
||||
tenant_id,
|
||||
e.get().current_state(),
|
||||
)),
|
||||
hash_map::Entry::Vacant(v) => match insert_fn() {
|
||||
hash_map::Entry::Vacant(v) => match insert_fn().await {
|
||||
Ok(tenant) => {
|
||||
v.insert(tenant.clone());
|
||||
Ok(tenant)
|
||||
|
||||
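The hunk above relaxes the insert closure from a synchronous `FnOnce() -> anyhow::Result<Arc<Tenant>>` to one returning a future, so callers can await filesystem work (such as `create_tenant_files`) while the map is held. A minimal sketch of the new calling shape, not a real call site:

// Sketch only (not a real call site): the relaxed bound lets the insert
// closure be async, so fallible async setup can run while the map is locked.
async fn insert_example(tenant_id: TenantId) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        // e.g. create_tenant_files(...).await? and other async preparation
        let tenant: Arc<Tenant> = todo!("construct or load the tenant here");
        Ok(tenant)
    })
    .await
}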
@@ -4,10 +4,9 @@ use std::{
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
fn fsync_path(path: &Path) -> io::Result<()> {
|
||||
let file = VirtualFile::open(path)?;
|
||||
// TODO use VirtualFile::fsync_all once we fully go async.
|
||||
let file = std::fs::File::open(path)?;
|
||||
file.sync_all()
|
||||
}
|
||||
|
||||
|
||||
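The TODO above notes that the synchronous `std::fs` fallback should eventually move to `VirtualFile::fsync_all` once the path is fully async. In the meantime, a common way to keep the blocking fsync off the async executor is `spawn_blocking`; this is a hedged sketch, an assumption rather than the crate's actual code:

use std::{io, path::PathBuf};

// Sketch only: push the blocking fsync onto tokio's blocking thread pool.
async fn fsync_path_async(path: PathBuf) -> io::Result<()> {
    tokio::task::spawn_blocking(move || std::fs::File::open(&path)?.sync_all())
        .await
        .expect("fsync task panicked")
}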
@@ -56,11 +56,9 @@
|
||||
//! # Consistency
|
||||
//!
|
||||
//! To have a consistent remote structure, it's important that uploads and
|
||||
//! deletions are performed in the right order. For example:
|
||||
//! - the index file contains a list of layer files, so it must not be uploaded
|
||||
//! until all the layer files that are in its list have been successfully uploaded.
|
||||
//! - objects must be removed from the index before being deleted, and that updated
|
||||
//! index must be written to remote storage before deleting the objects from remote storage.
|
||||
//! deletions are performed in the right order. For example, the index file
|
||||
//! contains a list of layer files, so it must not be uploaded until all the
|
||||
//! layer files that are in its list have been successfully uploaded.
|
||||
//!
|
||||
//! The contract between the client and its user is that the user is responsible for
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
@@ -72,12 +70,10 @@
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though:
|
||||
//! - they should not delete local files that have been scheduled for upload but
|
||||
//! not yet finished uploading. Otherwise the upload will fail. To wait for an
|
||||
//! upload to finish, use the 'wait_completion' function (more on that later.)
|
||||
//! - they should not do remote deletions via the DeletionQueue without waiting for
|
||||
//! the latest metadata to upload via RemoteTimelineClient.
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
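To make the scheduling contract above concrete, here is a hedged sketch of a caller obeying it; the signatures are simplified from the ones in this module and this is not a real call site:

// Sketch of the ordering contract: layers are scheduled before any index
// upload that references them, and wait_completion() is the barrier.
async fn upload_layers_then_index(
    client: &Arc<RemoteTimelineClient>,
    layers: &[(LayerFileName, LayerFileMetadata)],
) -> anyhow::Result<()> {
    for (name, meta) in layers {
        client.schedule_layer_file_upload(name, meta)?;
    }
    // Block until everything scheduled so far has landed remotely.
    client.wait_completion().await
}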
@@ -204,11 +200,12 @@
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
pub mod index;
|
||||
mod upload;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
@@ -229,7 +226,6 @@ use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
|
||||
@@ -238,6 +234,8 @@ use crate::metrics::{
|
||||
use crate::task_mgr::shutdown_token;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::upload_queue::Delete;
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr,
|
||||
@@ -247,7 +245,6 @@ use crate::{
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
tenant::TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -345,7 +342,12 @@ impl RemoteTimelineClient {
|
||||
) -> RemoteTimelineClient {
|
||||
RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
|
||||
runtime: if cfg!(test) {
|
||||
// remote_timeline_client.rs tests rely on current-thread runtime
|
||||
tokio::runtime::Handle::current()
|
||||
} else {
|
||||
BACKGROUND_RUNTIME.handle().clone()
|
||||
},
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
generation,
|
||||
@@ -457,7 +459,6 @@ impl RemoteTimelineClient {
|
||||
);
|
||||
|
||||
let index_part = download::download_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
@@ -640,36 +641,44 @@ impl RemoteTimelineClient {
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// successfully.
|
||||
pub async fn schedule_layer_file_deletion(
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> anyhow::Result<()> {
|
||||
// Synchronous update of upload queues under mutex
|
||||
let with_generations = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need to update it. Just serialize it.
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need to update it. Just serialize it.
|
||||
let metadata = upload_queue.latest_metadata.clone();
|
||||
|
||||
// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
// before deleting the actual files.
|
||||
//
|
||||
// Once we start removing files from upload_queue.latest_files, there's
|
||||
// no going back! Otherwise, some of the files would already be removed
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
// Decorate our list of names with each name's generation, dropping
|
||||
// names that are unexpectedly missing from our metadata.
|
||||
let with_generations: Vec<_> = names
|
||||
.into_iter()
|
||||
.iter()
|
||||
.filter_map(|name| {
|
||||
// Remove from latest_files, learning the file's remote generation in the process
|
||||
let meta = upload_queue.latest_files.remove(name);
|
||||
|
||||
if let Some(meta) = meta {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
Some((name.clone(), meta.generation))
|
||||
Some((name, meta.generation))
|
||||
} else {
|
||||
// This is unexpected: latest_files is meant to be kept up to
|
||||
// date. We can't delete the layer if we have forgotten what
|
||||
// generation it was in.
|
||||
warn!("Deleting layer {name} not found in latest_files list");
|
||||
// This can only happen if we forgot to schedule the file upload
|
||||
// before scheduling the delete. Log it because it is a rare/strange
|
||||
// situation, and in case something is misbehaving, we'd like to know which
|
||||
// layers experienced this.
|
||||
info!(
|
||||
"Deleting layer {name} not found in latest_files list, never uploaded?"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
@@ -679,27 +688,23 @@ impl RemoteTimelineClient {
|
||||
self.schedule_index_upload(upload_queue, metadata);
|
||||
}
|
||||
|
||||
with_generations
|
||||
// schedule the actual deletions
|
||||
for (name, generation) in with_generations {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: false,
|
||||
generation,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {name}");
|
||||
}
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
};
|
||||
|
||||
// Barrier: we must ensure all prior uploads and index writes have landed in S3
|
||||
// before emitting deletions.
|
||||
if let Err(e) = self.wait_completion().await {
|
||||
// This can only fail if upload queue is shut down: if this happens, we do
|
||||
// not emit any deletions. In this condition (remote client is shut down
|
||||
// during compaction or GC) we may leak some objects.
|
||||
bail!("Cannot complete layer file deletions during shutdown ({e})");
|
||||
}
|
||||
|
||||
// Enqueue deletions
|
||||
deletion_queue_client
|
||||
.push_layers(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.generation,
|
||||
with_generations,
|
||||
)
|
||||
.await?;
|
||||
no_bail_here();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
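Summarizing the rewritten deletion flow above: the index is updated under the upload-queue mutex, a completion barrier makes sure the updated index has landed remotely, and only then are the layer objects handed to the deletion queue. A hypothetical caller of the deletion-queue-aware variant shown in this hunk would look like:

// Hypothetical caller; the three phases (index update, wait_completion()
// barrier, push_layers()) all happen inside the call.
async fn retire_layers(
    client: &Arc<RemoteTimelineClient>,
    deletion_queue_client: &DeletionQueueClient,
    names: &[LayerFileName],
) -> anyhow::Result<()> {
    client
        .schedule_layer_file_deletion(names, deletion_queue_client)
        .await
}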
@@ -825,13 +830,12 @@ impl RemoteTimelineClient {
|
||||
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfully set.
|
||||
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
|
||||
/// deletes any leaked files, and proceeds with deletion of the index file at the end.
|
||||
pub(crate) async fn delete_all(
|
||||
self: &Arc<Self>,
|
||||
deletion_queue: &DeletionQueueClient,
|
||||
) -> anyhow::Result<()> {
|
||||
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let layers: Vec<_> = {
|
||||
let (mut receiver, deletions_queued) = {
|
||||
let mut deletions_queued = 0;
|
||||
|
||||
let mut locked = self.upload_queue.lock().unwrap();
|
||||
let stopped = locked.stopped_mut()?;
|
||||
|
||||
@@ -843,30 +847,42 @@ impl RemoteTimelineClient {
|
||||
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.latest_files
|
||||
.drain()
|
||||
.map(|kv| (kv.0, kv.1.generation))
|
||||
.collect()
|
||||
.queued_operations
|
||||
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
|
||||
|
||||
// schedule the actual deletions
|
||||
for (name, meta) in &stopped.upload_queue_for_deletion.latest_files {
|
||||
let op = UploadOp::Delete(Delete {
|
||||
file_kind: RemoteOpFileKind::Layer,
|
||||
layer_file_name: name.clone(),
|
||||
scheduled_from_timeline_delete: true,
|
||||
generation: meta.generation,
|
||||
});
|
||||
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
stopped
|
||||
.upload_queue_for_deletion
|
||||
.queued_operations
|
||||
.push_back(op);
|
||||
|
||||
info!("scheduled layer file deletion {name}");
|
||||
deletions_queued += 1;
|
||||
}
|
||||
|
||||
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
|
||||
|
||||
(
|
||||
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
|
||||
deletions_queued,
|
||||
)
|
||||
};
|
||||
|
||||
let layer_deletion_count = layers.len();
|
||||
|
||||
let layer_paths = layers
|
||||
.into_iter()
|
||||
.map(|(layer, generation)| {
|
||||
remote_layer_path(&self.tenant_id, &self.timeline_id, &layer, generation)
|
||||
})
|
||||
.collect();
|
||||
deletion_queue.push_immediate(layer_paths).await?;
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and the retry arrives at a different pageserver, there won't be any traces of it in remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
deletion_queue.flush_immediate().await?;
|
||||
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
@@ -894,9 +910,17 @@ impl RemoteTimelineClient {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let not_referenced_count = remaining.len();
|
||||
if !remaining.is_empty() {
|
||||
deletion_queue.push_immediate(remaining).await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -907,14 +931,18 @@ impl RemoteTimelineClient {
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("enqueuing index part deletion");
|
||||
deletion_queue
|
||||
.push_immediate([index_file_path].to_vec())
|
||||
.await?;
|
||||
debug!("deleting index part");
|
||||
|
||||
// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
|
||||
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
|
||||
deletion_queue.flush_immediate().await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -922,7 +950,7 @@ impl RemoteTimelineClient {
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -945,6 +973,10 @@ impl RemoteTimelineClient {
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
|
||||
UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
|
||||
};
|
||||
@@ -972,6 +1004,9 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadMetadata(_, _) => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
continue;
|
||||
@@ -1105,6 +1140,21 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
res
|
||||
}
|
||||
UploadOp::Delete(delete) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_id, &self.timeline_id)
|
||||
.join(delete.layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path, delete.generation)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::Barrier(_) => {
|
||||
// unreachable. Barrier operations are handled synchronously in
|
||||
// launch_queued_tasks
|
||||
@@ -1164,7 +1214,15 @@ impl RemoteTimelineClient {
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
|
||||
UploadQueue::Stopped(_) => { None }
|
||||
UploadQueue::Stopped(stopped) => {
|
||||
// Special care is needed for deletions: if it was a regular deletion (not scheduled from timeline deletion)
|
||||
// then stop() took care of it so we just return.
|
||||
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
|
||||
match &task.op {
|
||||
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
|
||||
_ => None
|
||||
}
|
||||
},
|
||||
UploadQueue::Initialized(qi) => { Some(qi) }
|
||||
};
|
||||
|
||||
@@ -1186,6 +1244,9 @@ impl RemoteTimelineClient {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
|
||||
}
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
}
|
||||
UploadOp::Barrier(_) => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -1217,6 +1278,13 @@ impl RemoteTimelineClient {
|
||||
reason: "metadata uploads are tiny",
|
||||
},
|
||||
),
|
||||
UploadOp::Delete(delete) => (
|
||||
delete.file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
DontTrackSize {
|
||||
reason: "should we track deletes? positive or negative sign?",
|
||||
},
|
||||
),
|
||||
UploadOp::Barrier(_) => {
|
||||
// we do not account these
|
||||
return None;
|
||||
@@ -1276,6 +1344,7 @@ impl RemoteTimelineClient {
|
||||
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
};
|
||||
@@ -1296,7 +1365,9 @@ impl RemoteTimelineClient {
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads + qi.num_inprogress_metadata_uploads,
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
@@ -1334,13 +1405,13 @@ pub fn remote_layer_path(
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
layer_file_name: &LayerFileName,
|
||||
generation: Generation,
|
||||
layer_meta: &LayerFileMetadata,
|
||||
) -> RemotePath {
|
||||
// Generation-aware key format
|
||||
let path = format!(
|
||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||
layer_file_name.file_name(),
|
||||
generation.get_suffix()
|
||||
layer_meta.generation.get_suffix()
|
||||
);
|
||||
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
@@ -1359,6 +1430,30 @@ pub fn remote_index_path(
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
let file_name = match path.get_path().file_name() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
// Unexpected: we should be seeing index_part.json paths only
|
||||
tracing::warn!("Malformed index key {}", path);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let file_name_str = match file_name.to_str() {
|
||||
Some(s) => s,
|
||||
None => {
|
||||
tracing::warn!("Malformed index key {:?}", path);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
match file_name_str.split_once('-') {
|
||||
Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
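As a companion to the parser above, a standalone sketch of the round trip it expects; the 8-hex-digit suffix encoding is an assumption carried over from the earlier `from_str_radix(_, 16)` version of this code, while the real encoding lives in `Generation::get_suffix`/`parse_suffix`, which are not part of this diff:

// Standalone sketch, not the crate's parser: split the object name on the
// final '-' and decode the hex suffix.
fn parse_index_generation(file_name: &str) -> Option<u32> {
    let (_, suffix) = file_name.rsplit_once('-')?;
    u32::from_str_radix(suffix, 16).ok()
}

// parse_index_generation("index_part.json-000000ab") == Some(0xab)
// parse_index_generation("index_part.json")          == None (legacy, no generation)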
/// Files on the remote storage are stored with paths, relative to the workdir.
|
||||
/// That path itself includes both tenant and timeline ids, which makes the remote storage path unique.
|
||||
///
|
||||
@@ -1366,25 +1461,21 @@ pub fn remote_index_path(
|
||||
pub fn remote_path(
|
||||
conf: &PageServerConf,
|
||||
local_path: &Path,
|
||||
generation: Option<Generation>,
|
||||
generation: Generation,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let stripped = local_path
|
||||
.strip_prefix(&conf.workdir)
|
||||
.context("Failed to strip workdir prefix")?;
|
||||
|
||||
let suffixed = if let Some(generation) = generation {
|
||||
format!(
|
||||
"{0}{1}",
|
||||
stripped.to_string_lossy(),
|
||||
generation.get_suffix()
|
||||
)
|
||||
} else {
|
||||
stripped.to_string_lossy().to_string()
|
||||
};
|
||||
let suffixed = format!(
|
||||
"{0}{1}",
|
||||
stripped.to_string_lossy(),
|
||||
generation.get_suffix()
|
||||
);
|
||||
|
||||
RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
"to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, conf.workdir
|
||||
)
|
||||
})
|
||||
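The change above folds the `Option` away: the legacy, un-suffixed key is now expressed as `Generation::none()` producing an empty suffix. A hedged illustration, assuming `get_suffix()` renders as an empty string for the none generation, which is what the legacy-path handling elsewhere in this diff implies:

// Illustration of the simplification; not the crate's exact helper.
fn suffixed_key(stripped: &str, generation: Generation) -> String {
    format!("{stripped}{}", generation.get_suffix())
}
// With Generation::new(0xab) the key gains a generation suffix; with
// Generation::none() it is just the stripped local path, i.e. the legacy key.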
@@ -1395,18 +1486,14 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
deletion_queue::mock::MockDeletionQueue,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
Generation, Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1463,9 +1550,6 @@ mod tests {
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
client: Arc<RemoteTimelineClient>,
|
||||
deletion_queue: MockDeletionQueue,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
@@ -1475,57 +1559,44 @@ mod tests {
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// create an empty timeline directory
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
|
||||
|
||||
let storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
|
||||
)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
};
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: storage.clone(),
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
|
||||
let deletion_queue = MockDeletionQueue::new(Some(storage));
|
||||
|
||||
Ok(Self {
|
||||
harness,
|
||||
tenant,
|
||||
timeline,
|
||||
tenant_ctx: ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
deletion_queue,
|
||||
})
|
||||
}
|
||||
|
||||
/// Construct a RemoteTimelineClient in an arbitrary generation
|
||||
fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
|
||||
Arc::new(RemoteTimelineClient {
|
||||
conf: self.harness.conf,
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: self.harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
generation,
|
||||
storage_impl: self.harness.remote_storage.clone(),
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&self.harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
})
|
||||
}
|
||||
|
||||
/// A tracing::Span that satisfies remote_timeline_client methods that assert tenant_id
|
||||
/// and timeline_id are present.
|
||||
fn span(&self) -> tracing::Span {
|
||||
tracing::info_span!(
|
||||
"test",
|
||||
tenant_id = %self.harness.tenant_id,
|
||||
timeline_id = %TIMELINE_ID
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Test scheduling
|
||||
@@ -1545,30 +1616,44 @@ mod tests {
|
||||
// Schedule another deletion. Check that it's launched immediately.
|
||||
// Schedule index upload. Check that it's queued
|
||||
|
||||
let test_setup = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
let span = test_setup.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
let TestSetup {
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline: _timeline,
|
||||
timeline,
|
||||
tenant_ctx: _tenant_ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
deletion_queue,
|
||||
} = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
} = test_setup;
|
||||
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let initial_index_part = match client.download_index_file().await.unwrap() {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
let initial_layers = initial_index_part
|
||||
.layer_metadata
|
||||
.keys()
|
||||
.map(|f| f.to_owned())
|
||||
.collect::<HashSet<LayerFileName>>();
|
||||
let initial_layer = {
|
||||
assert!(initial_layers.len() == 1);
|
||||
initial_layers.into_iter().next().unwrap()
|
||||
};
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let remote_timeline_dir =
|
||||
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
|
||||
let remote_timeline_dir = harness
|
||||
.remote_fs_dir
|
||||
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
|
||||
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
let generation = harness.generation;
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1649,6 +1734,7 @@ mod tests {
|
||||
.map(|f| f.to_owned())
|
||||
.collect(),
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
],
|
||||
@@ -1662,68 +1748,37 @@ mod tests {
|
||||
&LayerFileMetadata::new(content_3.len() as u64, generation),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert_eq!(upload_queue.queued_operations.len(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
}
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
generation,
|
||||
);
|
||||
|
||||
client
|
||||
.schedule_layer_file_deletion(
|
||||
&[layer_file_name_1.clone()],
|
||||
&deletion_queue.new_client(),
|
||||
)
|
||||
.await
|
||||
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
|
||||
// Deletion schedules upload of the index file via RemoteTimelineClient, and
|
||||
// deletion of layer files via DeletionQueue. The uploads have all been flushed
|
||||
// because schedule_layer_file_deletion does a wait_completion before pushing
|
||||
// to the deletion_queue
|
||||
assert_eq!(upload_queue.queued_operations.len(), 0);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
);
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert!(upload_queue.queued_operations.len() == 2);
|
||||
assert!(upload_queue.inprogress_tasks.len() == 1);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 1);
|
||||
assert!(upload_queue.num_inprogress_deletions == 0);
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
|
||||
}
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_1.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
"index_part.json",
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
generation,
|
||||
);
|
||||
|
||||
// Finish uploads and deletions
|
||||
// Finish them
|
||||
client.wait_completion().await.unwrap();
|
||||
deletion_queue.pump().await;
|
||||
|
||||
// 1 layer was deleted
|
||||
assert_eq!(deletion_queue.get_executed(), 1);
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
&initial_layer.file_name(),
|
||||
&layer_file_name_2.file_name(),
|
||||
&layer_file_name_3.file_name(),
|
||||
"index_part.json",
|
||||
@@ -1740,16 +1795,10 @@ mod tests {
|
||||
let TestSetup {
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline: _timeline,
|
||||
client,
|
||||
timeline,
|
||||
..
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let client = timeline.remote_client.as_ref().unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1760,11 +1809,20 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
struct BytesStartedFinished {
|
||||
started: Option<usize>,
|
||||
finished: Option<usize>,
|
||||
}
|
||||
impl std::ops::Add for BytesStartedFinished {
|
||||
type Output = Self;
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
Self {
|
||||
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
|
||||
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
let get_bytes_started_stopped = || {
|
||||
let started = client
|
||||
.metrics
|
||||
@@ -1781,66 +1839,140 @@ mod tests {
|
||||
};
|
||||
|
||||
// Test
|
||||
tracing::info!("now doing actual test");
|
||||
|
||||
let generation = Generation::new(0xdeadbeef);
|
||||
|
||||
let init = get_bytes_started_stopped();
|
||||
let actual_a = get_bytes_started_stopped();
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64, generation),
|
||||
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pre = get_bytes_started_stopped();
|
||||
let actual_b = get_bytes_started_stopped();
|
||||
|
||||
client.wait_completion().await.unwrap();
|
||||
|
||||
let post = get_bytes_started_stopped();
|
||||
let actual_c = get_bytes_started_stopped();
|
||||
|
||||
// Validate
|
||||
|
||||
assert_eq!(
|
||||
init,
|
||||
BytesStartedFinished {
|
||||
started: None,
|
||||
finished: None
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
pre,
|
||||
BytesStartedFinished {
|
||||
let expected_b = actual_a
|
||||
+ BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
// assert that the _finished metric is created eagerly so that subtractions work on first sample
|
||||
finished: Some(0),
|
||||
}
|
||||
);
|
||||
assert_eq!(
|
||||
post,
|
||||
BytesStartedFinished {
|
||||
};
|
||||
assert_eq!(actual_b, expected_b);
|
||||
|
||||
let expected_c = actual_a
|
||||
+ BytesStartedFinished {
|
||||
started: Some(content_1.len()),
|
||||
finished: Some(content_1.len())
|
||||
}
|
||||
);
|
||||
finished: Some(content_1.len()),
|
||||
};
|
||||
assert_eq!(actual_c, expected_c);
|
||||
}
|
||||
|
||||
// #[tokio::test]
|
||||
// async fn index_part_download() {
|
||||
// let TestSetup {
|
||||
// harness,
|
||||
// tenant: _tenant,
|
||||
// timeline: _timeline,
|
||||
// client,
|
||||
// ..
|
||||
// } = TestSetup::new("index_part_download").await.unwrap();
|
||||
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
|
||||
// An empty IndexPart, just sufficient to ensure deserialization will succeed
|
||||
let example_metadata = TimelineMetadata::example();
|
||||
let example_index_part = IndexPart::new(
|
||||
HashMap::new(),
|
||||
example_metadata.disk_consistent_lsn(),
|
||||
example_metadata,
|
||||
);
|
||||
|
||||
// let example_index_part = IndexPart {
|
||||
// version: 3,
|
||||
// timeline_layers: HashSet::new(),
|
||||
// layer_metadata:
|
||||
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
|
||||
|
||||
// }
|
||||
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
|
||||
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
|
||||
timeline_path
|
||||
.strip_prefix(&test_state.harness.conf.workdir)
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
// }
|
||||
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
||||
|
||||
let index_path = test_state.harness.remote_fs_dir.join(
|
||||
remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
|
||||
);
|
||||
eprintln!("Writing {}", index_path.display());
|
||||
std::fs::write(&index_path, index_part_bytes).unwrap();
|
||||
example_index_part
|
||||
}
|
||||
|
||||
/// Assert that when a RemoteTimelineClient in generation `get_generation` fetches its
|
||||
/// index, the IndexPart returned is equal to `expected`
|
||||
async fn assert_got_index_part(
|
||||
test_state: &TestSetup,
|
||||
get_generation: Generation,
|
||||
expected: &IndexPart,
|
||||
) {
|
||||
let client = test_state.build_client(get_generation);
|
||||
|
||||
let download_r = client
|
||||
.download_index_file()
|
||||
.await
|
||||
.expect("download should always succeed");
|
||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||
match download_r {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => {
|
||||
assert_eq!(&index_part, expected);
|
||||
}
|
||||
MaybeDeletedIndexPart::Deleted(_index_part) => panic!("Test doesn't set deleted_at"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn index_part_download_simple() -> anyhow::Result<()> {
|
||||
let test_state = TestSetup::new("index_part_download_simple").await.unwrap();
|
||||
let span = test_state.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
// Simple case: we are in generation N, load the index from generation N - 1
|
||||
let generation_n = 5;
|
||||
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
|
||||
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn index_part_download_ordering() -> anyhow::Result<()> {
|
||||
let test_state = TestSetup::new("index_part_download_ordering")
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let span = test_state.span();
|
||||
let _guard = span.enter();
|
||||
|
||||
// A generation-less IndexPart exists in the bucket, we should find it
|
||||
let generation_n = 5;
|
||||
let injected_none = inject_index_part(&test_state, Generation::none()).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
|
||||
|
||||
// If a more recent-than-none generation exists, we should prefer to load that
|
||||
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
|
||||
|
||||
// If a more-recent-than-me generation exists, we should ignore it.
|
||||
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
|
||||
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
|
||||
|
||||
// If a directly previous generation exists, _and_ an index exists in my own
|
||||
// generation, I should prefer my own generation.
|
||||
let _injected_prev =
|
||||
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
|
||||
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
|
||||
assert_got_index_part(
|
||||
&test_state,
|
||||
Generation::new(generation_n),
|
||||
&injected_current,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pageserver/src/tenant/remote_timeline_client/delete.rs (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::{remote_timeline_client::remote_path, Generation},
|
||||
};
|
||||
|
||||
pub(super) async fn delete_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
generation: Generation,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
let path_to_delete = remote_path(conf, local_layer_path, generation)?;
|
||||
|
||||
// We don't want to print an error if the delete failed because the file has
|
||||
// already been deleted. Thankfully, in this situation S3
|
||||
// does not yield an error. While OS-provided local file system APIs do yield
|
||||
// errors, we avoid them in the `LocalFs` wrapper.
|
||||
storage
|
||||
.delete(&path_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("delete remote layer from storage at {path_to_delete:?}"))
|
||||
}
|
||||
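The `fail_point!` at the top of `delete_layer` can be exercised from tests. A hedged, test-side sketch using the fail crate's public `cfg`/`remove` API; the test name and surrounding wiring are hypothetical, not taken from this repository:

// Hypothetical test: only the failpoint activation is the point here.
#[tokio::test]
async fn delete_layer_failpoint_smoke() {
    fail::cfg("before-delete-layer", "return").unwrap();
    // ... drive a code path that reaches delete_layer and assert it returns
    //     the "failpoint before-delete-layer" error ...
    fail::remove("before-delete-layer");
}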
@@ -19,12 +19,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::Generation;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
@@ -47,12 +50,7 @@ pub async fn download_layer_file<'a>(
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
layer_file_name,
|
||||
layer_metadata.generation,
|
||||
);
|
||||
let remote_path = remote_layer_path(&tenant_id, &timeline_id, layer_file_name, layer_metadata);
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
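A compact sketch of the durable_rename-style sequence this comment refers to, using tokio::fs; it is a simplification of the code below (size verification and error context omitted), not a drop-in replacement:

use std::path::Path;
use tokio::{fs, io::AsyncWriteExt};

// Write to a temp path, flush + fsync it, rename it into place, then fsync
// the destination, mirroring crashsafe::fsync_async below.
async fn durable_write(temp: &Path, dest: &Path, bytes: &[u8]) -> anyhow::Result<()> {
    let mut file = fs::File::create(temp).await?;
    file.write_all(bytes).await?;
    file.flush().await?;
    file.sync_all().await?;
    fs::rename(temp, dest).await?;
    fs::File::open(dest).await?.sync_all().await?;
    Ok(())
}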
@@ -69,33 +67,43 @@ pub async fn download_layer_file<'a>(
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage.download(&remote_path).await.with_context(|| {
|
||||
format!(
|
||||
let mut destination_file = fs::File::create(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage
|
||||
.download(&remote_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
|
||||
.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
let bytes_amount = tokio::time::timeout(
|
||||
MAX_DOWNLOAD_DURATION,
|
||||
tokio::io::copy(&mut download.download_stream, &mut destination_file),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
).await?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
@@ -108,12 +116,7 @@ pub async fn download_layer_file<'a>(
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.with_context(|| format!("flush source file at {}", temp_file_path.display()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
@@ -144,17 +147,12 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
fs::rename(&temp_file_path, &local_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not rename download layer file to {}",
|
||||
local_path.display(),
|
||||
)
|
||||
})
|
||||
.with_context(|| format!("rename download layer file to {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
crashsafe::fsync_async(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
|
||||
.with_context(|| format!("fsync layer file {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
tracing::debug!("download complete: {}", local_path.display());
|
||||
@@ -205,9 +203,9 @@ pub async fn list_remote_timelines(
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
let timeline_id: TimelineId = object_name
|
||||
.parse()
|
||||
.with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
|
||||
|
||||
// list_prefixes is assumed to return unique names. Ensure this here.
|
||||
// NB: it's safer to bail out than warn-log this because the pageserver
|
||||
@@ -225,7 +223,6 @@ pub async fn list_remote_timelines(
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
local_path: &Path,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
@@ -234,83 +231,92 @@ async fn do_download_index_part(
|
||||
let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry(
|
||||
|| storage.download_all(&remote_path),
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&remote_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| format!("Failed to deserialize index part file into file {local_path:?}"))
|
||||
.with_context(|| format!("download index part file at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
///
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||
pub(super) async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let local_path = conf
|
||||
.metadata_path(tenant_id, timeline_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download_index_part(&local_path, storage, tenant_id, timeline_id, my_generation)
|
||||
.await;
|
||||
return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
}
|
||||
|
||||
let previous_gen = my_generation.previous();
|
||||
let r_previous =
|
||||
do_download_index_part(&local_path, storage, tenant_id, timeline_id, previous_gen).await;
|
||||
|
||||
match r_previous {
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation {previous_gen}");
|
||||
tracing::debug!(
|
||||
"Found index_part from current generation (this is a stale attachment)"
|
||||
);
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(e) => {
|
||||
if matches!(e, DownloadError::NotFound) {
|
||||
tracing::debug!("No index_part found from previous generation {previous_gen}, falling back to listing");
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
Err(DownloadError::NotFound) => {}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
fn parse_generation(path: RemotePath) -> Option<Generation> {
|
||||
let path = path.take();
|
||||
let file_name = match path.file_name() {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
// Unexpected: we should be seeing index_part.json paths only
|
||||
tracing::warn!("Malformed index key {0}", path.display());
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let file_name_str = match file_name.to_str() {
|
||||
Some(s) => s,
|
||||
None => {
|
||||
tracing::warn!("Malformed index key {0}", path.display());
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
match file_name_str.split_once("-") {
|
||||
Some((_, gen_suffix)) => u32::from_str_radix(gen_suffix, 16)
|
||||
.map(|g| Generation::new(g))
|
||||
.ok(),
|
||||
None => None,
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
|
||||
// an index part. We may safely start from this index without doing a listing, because:
|
||||
// - We checked for current generation case above
|
||||
// - generations > my_generation are to be ignored
|
||||
// - any other indices that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent index from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
|
||||
match res {
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
tracing::debug!(
|
||||
"No index_part found from previous generation, falling back to listing"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: we did not find an index_part.json from the previous generation, so
|
||||
// we will list all the index_part objects and pick the most recent.
|
||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||
// objects, and select the highest one with a generation <= my_generation.
|
||||
let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
|
||||
let indices = backoff::retry(
|
||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
||||
@@ -324,38 +330,26 @@ pub(super) async fn download_index_part(
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e))?;
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut generations: Vec<_> = indices
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
let max_previous_generation = indices
|
||||
.into_iter()
|
||||
.filter_map(|k| parse_generation(k))
|
||||
.filter_map(parse_remote_index_path)
|
||||
.filter(|g| g <= &my_generation)
|
||||
.collect();
|
||||
.max();
|
||||
|
||||
generations.sort();
|
||||
match generations.last() {
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found index_part in generation {g} (my generation {my_generation})");
|
||||
do_download_index_part(&local_path, storage, tenant_id, timeline_id, *g).await
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, g).await
|
||||
}
|
||||
None => {
|
||||
// This is not an error: the timeline may be newly created, or we may be
|
||||
// upgrading and have no historical index_part with a generation suffix.
|
||||
// Fall back to trying to load the un-suffixed index_part.json.
|
||||
tracing::info!(
|
||||
"No index_part.json-* found when loading {}/{} in generation {}",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
my_generation
|
||||
);
|
||||
return do_download_index_part(
|
||||
&local_path,
|
||||
storage,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
Generation::none(),
|
||||
)
|
||||
.await;
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::info!("No index_part.json* found");
|
||||
do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
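The probe order implemented above is: the generation-less path when we have no generation at all; otherwise our own generation (the stale re-attachment case), then the immediately previous generation (the typical healthy-predecessor case), then a listing from which the newest generation not newer than ours is chosen, and finally the un-suffixed legacy path. The selection rule in the listing step reduces to a filter-and-max, shown here as a self-contained toy rather than the real function:

// Toy of the listing step's selection rule: among the generations found under
// index_part.json-*, take the newest one that is not newer than our own.
fn pick_generation(found: &[u32], my_generation: u32) -> Option<u32> {
    found.iter().copied().filter(|g| *g <= my_generation).max()
}

// pick_generation(&[1, 3, 10], 5) == Some(3)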
@@ -2,7 +2,7 @@
|
||||
//! Able to restore itself from the storage index parts, which are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and their metadata.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -69,10 +69,6 @@ pub struct IndexPart {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_at: Option<NaiveDateTime>,
|
||||
|
||||
/// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
|
||||
#[serde(default, skip_deserializing)]
|
||||
timeline_layers: HashSet<LayerFileName>,
|
||||
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
@@ -98,7 +94,12 @@ impl IndexPart {
|
||||
/// - 2: added `deleted_at`
|
||||
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
|
||||
/// is always generated from the keys of `layer_metadata`)
|
||||
const LATEST_VERSION: usize = 3;
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
const LATEST_VERSION: usize = 4;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
@@ -106,24 +107,30 @@ impl IndexPart {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata: TimelineMetadata,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
|
||||
for (remote_name, metadata) in &layers_and_metadata {
|
||||
timeline_layers.insert(remote_name.to_owned());
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
layer_metadata.insert(remote_name.to_owned(), metadata);
|
||||
}
|
||||
// Transform LayerFileMetadata into IndexLayerMetadata
|
||||
let layer_metadata = layers_and_metadata
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata,
|
||||
deleted_at: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_version(&self) -> usize {
|
||||
self.version
|
||||
}
|
||||
|
||||
/// If you want this under normal operations, read it from self.metadata:
|
||||
/// this method is just for the scrubber to use when validating an index.
|
||||
pub fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
@@ -144,15 +151,15 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexLayerMetadata {
|
||||
pub(super) file_size: u64,
|
||||
pub file_size: u64,
|
||||
|
||||
#[serde(default = "Generation::none")]
|
||||
#[serde(skip_serializing_if = "Generation::is_none")]
|
||||
pub(super) generation: Generation,
|
||||
}
|
||||
|
||||
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: &'_ LayerFileMetadata) -> Self {
|
||||
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
||||
fn from(other: LayerFileMetadata) -> Self {
|
||||
IndexLayerMetadata {
|
||||
file_size: other.file_size,
|
||||
generation: other.generation,
|
||||
@@ -180,7 +187,6 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -219,7 +225,6 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -259,7 +264,6 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
@@ -294,7 +298,6 @@ mod tests {
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 1,
|
||||
timeline_layers: HashSet::new(),
|
||||
layer_metadata: HashMap::new(),
|
||||
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[
|
||||
@@ -327,4 +330,41 @@ mod tests {
|
||||
|
||||
assert_eq!(empty_layers_parsed, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v4_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version":4,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 4,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
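The `IndexPart` changes above bump `LATEST_VERSION` to 4 while keeping `KNOWN_VERSIONS`, so index files written by older pageservers still deserialize; the new `v4_indexpart_is_parsed` test exercises exactly that. The following standalone sketch shows the same serde pattern of a version field plus defaulted optional fields; the struct and field names are simplified stand-ins, not the real `IndexPart` definition.

// Sketch of version-tolerant deserialization in the style of IndexPart:
// absent optional fields fall back to defaults, and the reader checks the
// version against a known set before trusting the contents.
// Requires the `serde` (with derive) and `serde_json` crates.
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];

#[derive(Debug, Serialize, Deserialize)]
struct ExampleIndex {
    version: usize,
    #[serde(default)]
    layer_metadata: HashMap<String, u64>, // layer name -> file size, simplified
    #[serde(default, skip_serializing_if = "Option::is_none")]
    deleted_at: Option<String>, // the real field is a NaiveDateTime
}

fn main() -> Result<(), serde_json::Error> {
    // A minimal "v1"-style document: no layer_metadata, no deleted_at.
    let old = r#"{ "version": 1 }"#;
    let parsed: ExampleIndex = serde_json::from_str(old)?;
    assert!(KNOWN_VERSIONS.contains(&parsed.version));
    assert!(parsed.layer_metadata.is_empty());
    assert!(parsed.deleted_at.is_none());

    // A "v4"-style document with per-layer sizes.
    let new = r#"{ "version": 4, "layer_metadata": { "some-layer": 25600000 } }"#;
    let parsed: ExampleIndex = serde_json::from_str(new)?;
    assert_eq!(parsed.layer_metadata["some-layer"], 25600000);
    Ok(())
}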
|
||||
@@ -31,8 +31,8 @@ pub(super) async fn upload_index_part<'a>(
|
||||
bail!("failpoint before-upload-index")
|
||||
});
|
||||
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
.context("Failed to serialize index part file into bytes")?;
|
||||
let index_part_bytes =
|
||||
serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
@@ -40,7 +40,7 @@ pub(super) async fn upload_index_part<'a>(
|
||||
storage
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
@@ -58,7 +58,7 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
|
||||
let storage_path = remote_path(conf, source_path, Some(generation))?;
|
||||
let storage_path = remote_path(conf, source_path, generation)?;
|
||||
let source_file_res = fs::File::open(&source_path).await;
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
@@ -71,16 +71,15 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => Err(e)
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?,
|
||||
Err(e) => {
|
||||
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
|
||||
}
|
||||
};
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to get the source file metadata for layer {source_path:?}")
|
||||
})?
|
||||
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
|
||||
.len();
|
||||
|
||||
let metadata_size = known_metadata.file_size();
|
||||
@@ -88,19 +87,13 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
|
||||
}
|
||||
|
||||
let fs_size = usize::try_from(fs_size).with_context(|| {
|
||||
format!("File {source_path:?} size {fs_size} could not be converted to usize")
|
||||
})?;
|
||||
let fs_size = usize::try_from(fs_size)
|
||||
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
|
||||
|
||||
storage
|
||||
.upload(source_file, fs_size, &storage_path, None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload a layer from local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
.with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -45,8 +45,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -219,7 +218,7 @@ pub struct DeltaLayerInner {
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
file: FileBlockReader,
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
@@ -583,14 +582,14 @@ struct DeltaLayerWriterInner {
|
||||
|
||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||
|
||||
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
|
||||
blob_writer: BlobWriter<true>,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
fn new(
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -605,11 +604,10 @@ impl DeltaLayerWriterInner {
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
|
||||
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
let mut file = VirtualFile::create(&path).await?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -632,11 +630,12 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
||||
.await
|
||||
}
|
||||
|
||||
fn put_value_bytes(
|
||||
async fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -645,7 +644,7 @@ impl DeltaLayerWriterInner {
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
|
||||
let off = self.blob_writer.write_blob(val)?;
|
||||
let off = self.blob_writer.write_blob(val).await?;
|
||||
|
||||
let blob_ref = BlobRef::new(off, will_init);
|
||||
|
||||
@@ -662,18 +661,18 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
let buf_writer = self.blob_writer.into_inner();
|
||||
let mut file = buf_writer.into_inner()?;
|
||||
let mut file = self.blob_writer.into_inner().await?;
|
||||
|
||||
// Write out the index
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref())?;
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
}
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
// Fill in the summary on blk 0
|
||||
@@ -687,11 +686,22 @@ impl DeltaLayerWriterInner {
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
};
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.await
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
@@ -722,7 +732,7 @@ impl DeltaLayerWriterInner {
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
file.sync_all().await?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
@@ -774,7 +784,7 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Start building a new delta layer.
|
||||
///
|
||||
pub fn new(
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -782,13 +792,10 @@ impl DeltaLayerWriter {
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
inner: Some(DeltaLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_start,
|
||||
lsn_range,
|
||||
)?),
|
||||
inner: Some(
|
||||
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
|
||||
.await?,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -797,11 +804,11 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val)
|
||||
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
|
||||
}
|
||||
|
||||
pub fn put_value_bytes(
|
||||
pub async fn put_value_bytes(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -812,6 +819,7 @@ impl DeltaLayerWriter {
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.put_value_bytes(key, lsn, val, will_init)
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
@@ -821,21 +829,18 @@ impl DeltaLayerWriter {
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end)
|
||||
pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
self.inner.take().unwrap().finish(key_end).await
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeltaLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(inner) = self.inner.take() {
|
||||
match inner.blob_writer.into_inner().into_inner() {
|
||||
Ok(vfile) => vfile.remove(),
|
||||
Err(err) => warn!(
|
||||
"error while flushing buffer of image layer temporary file: {}",
|
||||
err
|
||||
),
|
||||
}
|
||||
// We want to remove the virtual file here, so it's fine not to have
// completely flushed unwritten data.
|
||||
let vfile = inner.blob_writer.into_inner_no_flush();
|
||||
vfile.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -846,6 +851,7 @@ impl DeltaLayerInner {
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
|
||||
@@ -212,7 +212,7 @@ pub enum LayerFileName {
|
||||
}
|
||||
|
||||
impl LayerFileName {
|
||||
pub(crate) fn file_name(&self) -> String {
|
||||
pub fn file_name(&self) -> String {
|
||||
self.to_string()
|
||||
}
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -42,8 +42,7 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -155,7 +154,7 @@ pub struct ImageLayerInner {
|
||||
lsn: Lsn,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
file: FileBlockReader,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
@@ -439,6 +438,7 @@ impl ImageLayerInner {
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0).await?;
|
||||
@@ -511,7 +511,7 @@ struct ImageLayerWriterInner {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
blob_writer: WriteBlobWriter<VirtualFile>,
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
}
|
||||
|
||||
@@ -519,7 +519,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
fn new(
|
||||
async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -541,10 +541,11 @@ impl ImageLayerWriterInner {
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
|
||||
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
// Initialize the b-tree index builder
|
||||
let block_buf = BlockBuf::new();
|
||||
@@ -569,9 +570,9 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.blob_writer.write_blob(img)?;
|
||||
let off = self.blob_writer.write_blob(img).await?;
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
@@ -583,17 +584,18 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
async fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
let mut file = self.blob_writer.into_inner();
|
||||
|
||||
// Write out the index
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
|
||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||
.await?;
|
||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref())?;
|
||||
file.write_all(buf.as_ref()).await?;
|
||||
}
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
@@ -607,11 +609,22 @@ impl ImageLayerWriterInner {
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
};
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&summary, &mut buf)?;
|
||||
if buf.spilled() {
|
||||
// This is bad as we only have one free block for the summary
|
||||
warn!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
);
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.await
|
||||
.context("get metadata to determine file size")?;
|
||||
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
@@ -634,7 +647,7 @@ impl ImageLayerWriterInner {
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
file.sync_all().await?;
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
@@ -687,7 +700,7 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
pub fn new(
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -695,13 +708,9 @@ impl ImageLayerWriter {
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
Ok(Self {
|
||||
inner: Some(ImageLayerWriterInner::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)?),
|
||||
inner: Some(
|
||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -710,15 +719,15 @@ impl ImageLayerWriter {
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img)
|
||||
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
||||
self.inner.as_mut().unwrap().put_image(key, img).await
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish()
|
||||
pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
|
||||
self.inner.take().unwrap().finish().await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -236,7 +236,7 @@ impl InMemoryLayer {
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
pub fn create(
|
||||
pub async fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
@@ -244,7 +244,7 @@ impl InMemoryLayer {
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
@@ -333,7 +333,8 @@ impl InMemoryLayer {
|
||||
self.tenant_id,
|
||||
Key::MIN,
|
||||
self.start_lsn..end_lsn,
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
@@ -348,11 +349,13 @@ impl InMemoryLayer {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
|
||||
delta_layer_writer
|
||||
.put_value_bytes(key, *lsn, &buf, will_init)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
|
||||
Ok(delta_layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,6 +102,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let sleep_duration = if period == Duration::ZERO {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
@@ -166,6 +167,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
|
||||
#[cfg(not(feature = "testing"))]
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
Duration::from_secs(10)
|
||||
|
||||
@@ -38,7 +38,6 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -91,6 +90,7 @@ use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
@@ -143,7 +143,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: Option<RemoteTimelineClient>,
|
||||
pub deletion_queue_client: Option<DeletionQueueClient>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -155,7 +154,8 @@ pub struct Timeline {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
// The generation of the tenant that instantiated us: this is used for safety when writing remote objects
|
||||
/// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
|
||||
/// Never changes for the lifetime of this [`Timeline`] object.
|
||||
generation: Generation,
|
||||
|
||||
pub pg_version: u32,
|
||||
@@ -201,9 +201,6 @@ pub struct Timeline {
|
||||
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
|
||||
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
||||
|
||||
/// Deletion queue: a global queue, separate to the remote storage queue's
|
||||
deletion_queue_client: Option<Arc<DeletionQueueClient>>,
|
||||
|
||||
// What page versions do we hold in the repository? If we get a
|
||||
// request > last_record_lsn, we need to wait until we receive all
|
||||
// the WAL up to the request. The SeqWait provides functions for
|
||||
@@ -937,6 +934,48 @@ impl Timeline {
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
|
||||
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// prevent writes to the InMemoryLayer
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
)
|
||||
.await;
|
||||
|
||||
// now all writers to InMemory layer are gone, do the final flush if requested
|
||||
if freeze_and_flush {
|
||||
match self.freeze_and_flush().await {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
return; // TODO: should probably drain remote timeline client anyways?
|
||||
}
|
||||
}
|
||||
|
||||
// drain the upload queue
|
||||
let res = if let Some(client) = self.remote_client.as_ref() {
|
||||
// if we did not wait for completion here, our shutdown process might not
// wait for remote uploads to complete at all, as new tasks can be spawned
// forever.
|
||||
//
|
||||
// what is problematic is the shutting down of RemoteTimelineClient, because
|
||||
// obviously it does not make sense to stop while we wait for it, but what
|
||||
// about corner cases like s3 suddenly hanging up?
|
||||
client.wait_completion().await
|
||||
} else {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
if let Err(e) = res {
|
||||
warn!("failed to await for frozen and flushed uploads: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
@@ -1271,18 +1310,6 @@ impl Timeline {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_all_remote(&self) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
if let Some(deletion_queue_client) = &self.deletion_queue_client {
|
||||
remote_client.delete_all(deletion_queue_client).await
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1437,7 +1464,6 @@ impl Timeline {
|
||||
walreceiver: Mutex::new(None),
|
||||
|
||||
remote_client: resources.remote_client.map(Arc::new),
|
||||
deletion_queue_client: resources.deletion_queue_client.map(Arc::new),
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
@@ -1698,11 +1724,18 @@ impl Timeline {
|
||||
for (name, decision) in decided {
|
||||
let decision = match decision {
|
||||
Ok(UseRemote { local, remote }) => {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
|
||||
UseRemote { local, remote }
|
||||
// Remote is authoritative, but we may still choose to retain
|
||||
// the local file if the contents appear to match
|
||||
if local.file_size() == remote.file_size() {
|
||||
// Use the local file, but take the remote metadata so that we pick up
|
||||
// the correct generation.
|
||||
UseLocal(remote)
|
||||
} else {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
|
||||
path.pop();
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(FutureLayer { local }) => {
|
||||
@@ -1781,15 +1814,11 @@ impl Timeline {
|
||||
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
|
||||
|
||||
if let Some(rtc) = self.remote_client.as_ref() {
|
||||
// Deletion queue client is always Some if remote_client is Some
|
||||
let deletion_queue_client = self.deletion_queue_client.as_ref().unwrap();
|
||||
|
||||
let (needs_upload, needs_cleanup) = to_sync;
|
||||
for (layer, m) in needs_upload {
|
||||
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
|
||||
}
|
||||
rtc.schedule_layer_file_deletion(&needs_cleanup, deletion_queue_client)
|
||||
.await?;
|
||||
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
|
||||
rtc.schedule_index_upload_for_file_changes()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
@@ -2515,13 +2544,15 @@ impl Timeline {
|
||||
///
|
||||
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
|
||||
let mut guard = self.layers.write().await;
|
||||
let layer = guard.get_layer_for_write(
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
)?;
|
||||
let layer = guard
|
||||
.get_layer_for_write(
|
||||
lsn,
|
||||
self.get_last_record_lsn(),
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
)
|
||||
.await?;
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
@@ -2756,6 +2787,7 @@ impl Timeline {
|
||||
if disk_consistent_lsn != old_disk_consistent_lsn {
|
||||
assert!(disk_consistent_lsn > old_disk_consistent_lsn);
|
||||
self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)
|
||||
.await
|
||||
.context("update_metadata_file")?;
|
||||
// Also update the in-memory copy
|
||||
self.disk_consistent_lsn.store(disk_consistent_lsn);
|
||||
@@ -2764,7 +2796,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Update metadata file
|
||||
fn update_metadata_file(
|
||||
async fn update_metadata_file(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layer_paths_to_upload: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
@@ -2805,14 +2837,9 @@ impl Timeline {
|
||||
x.unwrap()
|
||||
));
|
||||
|
||||
save_metadata(
|
||||
self.conf,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&metadata,
|
||||
false,
|
||||
)
|
||||
.context("save_metadata")?;
|
||||
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
for (path, layer_metadata) in layer_paths_to_upload {
|
||||
@@ -3015,7 +3042,8 @@ impl Timeline {
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
)?;
|
||||
)
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
@@ -3051,11 +3079,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, &img)?;
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
let image_layer = image_layer_writer.finish().await?;
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
}
|
||||
@@ -3600,7 +3628,11 @@ impl Timeline {
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next())
|
||||
.await?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
@@ -3615,20 +3647,23 @@ impl Timeline {
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initialized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
writer = Some(
|
||||
DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
@@ -3637,11 +3672,11 @@ impl Timeline {
|
||||
)))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next()).await?));
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -3830,13 +3865,7 @@ impl Timeline {
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
let deletion_queue = self
|
||||
.deletion_queue_client
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow::anyhow!("Remote storage enabled without deletion queue"))?;
|
||||
remote_client
|
||||
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
|
||||
.await?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4149,7 +4178,8 @@ impl Timeline {
|
||||
if !layers_to_remove.is_empty() {
|
||||
// Persist the new GC cutoff value in the metadata file, before
|
||||
// we actually remove anything.
|
||||
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
|
||||
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())
|
||||
.await?;
|
||||
|
||||
// Actually delete the layers from disk and remove them from the map.
|
||||
// (couldn't do this in the loop above, because you cannot modify a collection
|
||||
@@ -4170,15 +4200,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
// Remote metadata upload was scheduled in `update_metadata_file`: wait
|
||||
// for completion before scheduling any deletions.
|
||||
remote_client.wait_completion().await?;
|
||||
let deletion_queue = self.deletion_queue_client.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("Remote storage enabled without deletion queue")
|
||||
})?;
|
||||
remote_client
|
||||
.schedule_layer_file_deletion(&layer_names_to_delete, deletion_queue)
|
||||
.await?;
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
|
||||
apply.flush();
|
||||
@@ -4768,7 +4790,6 @@ mod tests {
|
||||
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
|
||||
|
||||
use super::{EvictionError, Timeline};
|
||||
@@ -4778,30 +4799,8 @@ mod tests {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness
|
||||
.try_load(
|
||||
&ctx,
|
||||
Some(remote_storage),
|
||||
Some(deletion_queue.new_client()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let tenant = harness.try_load(&ctx).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
@@ -4851,30 +4850,8 @@ mod tests {
|
||||
async fn layer_eviction_aba_fails() {
|
||||
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()), harness.conf);
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness
|
||||
.try_load(
|
||||
&ctx,
|
||||
Some(remote_storage),
|
||||
Some(deletion_queue.new_client()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let tenant = harness.try_load(&ctx).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
|
||||
@@ -14,7 +14,6 @@ use utils::{
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
deletion_queue::DeletionQueueClient,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
@@ -239,6 +238,15 @@ async fn delete_local_layer_files(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes remote layers and then the index file.
|
||||
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &timeline.remote_client {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function removes remaining traces of a timeline on disk.
|
||||
// Namely: metadata file, timeline directory, delete mark.
|
||||
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
|
||||
@@ -399,7 +407,6 @@ impl DeleteTimelineFlow {
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
deletion_queue_client: Option<DeletionQueueClient>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
@@ -409,10 +416,7 @@ impl DeleteTimelineFlow {
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
},
|
||||
TimelineResources { remote_client },
|
||||
init_order,
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
@@ -555,7 +559,7 @@ impl DeleteTimelineFlow {
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
|
||||
|
||||
timeline.delete_all_remote().await?;
|
||||
delete_remote_layers_and_index(timeline).await?;
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
|
||||
@@ -147,11 +147,7 @@ pub(super) fn reconcile(
|
||||
Err(FutureLayer { local })
|
||||
} else {
|
||||
Ok(match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => {
|
||||
assert_eq!(local.generation, remote.generation);
|
||||
|
||||
UseRemote { local, remote }
|
||||
}
|
||||
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
|
||||
(Some(x), Some(_)) => UseLocal(x),
|
||||
(None, Some(x)) => Evicted(x),
|
||||
(Some(x), None) => NeedsUpload(x),
|
||||
|
||||
@@ -87,7 +87,7 @@ impl LayerManager {
|
||||
|
||||
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
|
||||
/// called within `get_layer_for_write`.
|
||||
pub(crate) fn get_layer_for_write(
|
||||
pub(crate) async fn get_layer_for_write(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
last_record_lsn: Lsn,
|
||||
@@ -129,7 +129,7 @@ impl LayerManager {
|
||||
lsn
|
||||
);
|
||||
|
||||
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn)?;
|
||||
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
|
||||
let layer = Arc::new(new_layer);
|
||||
|
||||
self.layer_map.open_layer = Some(layer.clone());
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use super::Generation;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
@@ -60,6 +63,7 @@ pub(crate) struct UploadQueueInitialized {
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
pub(crate) num_inprogress_layer_uploads: usize,
|
||||
pub(crate) num_inprogress_metadata_uploads: usize,
|
||||
pub(crate) num_inprogress_deletions: usize,
|
||||
|
||||
/// Tasks that are currently in-progress. In-progress means that a tokio Task
|
||||
/// has been launched for it. An in-progress task can be busy uploading, but it can
|
||||
@@ -117,6 +121,7 @@ impl UploadQueue {
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
@@ -158,6 +163,7 @@ impl UploadQueue {
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
@@ -195,6 +201,14 @@ pub(crate) struct UploadTask {
|
||||
pub(crate) op: UploadOp,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Delete {
|
||||
pub(crate) file_kind: RemoteOpFileKind,
|
||||
pub(crate) layer_file_name: LayerFileName,
|
||||
pub(crate) scheduled_from_timeline_delete: bool,
|
||||
pub(crate) generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
@@ -203,6 +217,9 @@ pub(crate) enum UploadOp {
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a layer file
|
||||
Delete(Delete),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
}
|
||||
@@ -213,14 +230,22 @@ impl std::fmt::Display for UploadOp {
|
||||
UploadOp::UploadLayer(path, metadata) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?})",
|
||||
"UploadLayer({}, size={:?}, gen={:?})",
|
||||
path.file_name(),
|
||||
metadata.file_size()
|
||||
metadata.file_size(),
|
||||
metadata.generation,
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => {
|
||||
write!(f, "UploadMetadata(lsn: {})", lsn)
|
||||
}
|
||||
UploadOp::Delete(delete) => write!(
|
||||
f,
|
||||
"Delete(path: {}, scheduled_from_timeline_delete: {}, gen: {:?})",
|
||||
delete.layer_file_name.file_name(),
|
||||
delete.scheduled_from_timeline_delete,
|
||||
delete.generation
|
||||
),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
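`UploadOp::Barrier` carries a `tokio::sync::watch::Sender<()>`: a caller enqueues a barrier and awaits the paired receiver, and the queue worker fires the sender once everything scheduled before the barrier has been processed. The sketch below illustrates that mechanism with a toy queue; the `Op` enum and worker loop are assumptions for the example, not the `RemoteTimelineClient` implementation.

// Sketch: waiting for a queue to drain using a tokio watch-channel barrier.
// The worker processes operations in order; when it reaches Barrier(tx) it
// sends on the channel, which wakes everyone awaiting the paired receiver.
// Assumes tokio = { version = "1", features = ["full"] }.
use tokio::sync::{mpsc, watch};

enum Op {
    Work(&'static str),
    Barrier(watch::Sender<()>),
}

#[tokio::main]
async fn main() {
    let (queue_tx, mut queue_rx) = mpsc::unbounded_channel::<Op>();

    // Queue worker: executes operations strictly in FIFO order.
    let worker = tokio::spawn(async move {
        while let Some(op) = queue_rx.recv().await {
            match op {
                Op::Work(name) => println!("processed {name}"),
                Op::Barrier(done) => {
                    // Everything enqueued before this barrier is now done.
                    let _ = done.send(());
                }
            }
        }
    });

    queue_tx.send(Op::Work("upload layer A")).unwrap();
    queue_tx.send(Op::Work("upload index_part")).unwrap();

    // wait_completion-style call: enqueue a barrier and await its receiver.
    let (tx, mut rx) = watch::channel(());
    queue_tx.send(Op::Barrier(tx)).unwrap();
    rx.changed().await.unwrap();
    println!("all previously scheduled operations have completed");

    drop(queue_tx);
    worker.await.unwrap();
}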
|
||||
@@ -1 +0,0 @@
|
||||
-bash: scripts/pytest: No such file or directory
|
||||
@@ -11,13 +11,15 @@
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
use futures::Future;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -109,7 +111,7 @@ impl OpenFiles {
|
||||
///
|
||||
/// On return, we hold a lock on the slot, its 'tag' has been updated, and
/// 'recently_used' has been set. It's all ready for reuse.
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
@@ -141,7 +143,7 @@ impl OpenFiles {
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
slot_guard = slot.inner.write().await;
|
||||
index = next;
|
||||
break;
|
||||
}
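`find_victim_slot` keeps the clock (second-chance) algorithm described in the comment while switching the slot lock to tokio's async `RwLock`. Below is a small synchronous sketch of that victim-selection algorithm over a fixed slot array; the `Slot`/`Slots` types are illustrative and much simpler than the real open-file slot table.

// Sketch of the clock / second-chance victim selection: sweep a hand over
// the slots, clear `recently_used` flags as you pass them, and evict the
// first slot whose flag was already clear.
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

struct Slot {
    recently_used: AtomicBool,
}

struct Slots {
    slots: Vec<Slot>,
    next: AtomicUsize, // the clock hand
}

impl Slots {
    fn new(n: usize) -> Self {
        Slots {
            slots: (0..n)
                .map(|_| Slot { recently_used: AtomicBool::new(false) })
                .collect(),
            next: AtomicUsize::new(0),
        }
    }

    /// Mark a slot as touched so the next sweep gives it a second chance.
    fn touch(&self, index: usize) {
        self.slots[index].recently_used.store(true, Ordering::Relaxed);
    }

    /// Advance the hand until a slot without a second chance is found.
    fn find_victim(&self) -> usize {
        loop {
            let index = self.next.fetch_add(1, Ordering::Relaxed) % self.slots.len();
            // swap(false): take away the second chance; evict if it had none.
            if !self.slots[index].recently_used.swap(false, Ordering::Relaxed) {
                return index;
            }
        }
    }
}

fn main() {
    let slots = Slots::new(4);
    slots.touch(0);
    slots.touch(1);
    // Slots 0 and 1 get a second chance, so slot 2 is the first victim.
    assert_eq!(slots.find_victim(), 2);
    // Slot 0's flag was cleared during the sweep, so it is next.
    slots.touch(3);
    assert_eq!(slots.find_victim(), 0);
    println!("clock sweep picked victims 2 then 0");
}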
@@ -172,19 +174,55 @@ impl OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CrashsafeOverwriteError {
|
||||
#[error("final path has no parent dir")]
|
||||
FinalPathHasNoParentDir,
|
||||
#[error("remove tempfile: {0}")]
|
||||
RemovePreviousTempfile(#[source] std::io::Error),
|
||||
#[error("create tempfile: {0}")]
|
||||
CreateTempfile(#[source] std::io::Error),
|
||||
#[error("write tempfile: {0}")]
|
||||
WriteContents(#[source] std::io::Error),
|
||||
#[error("sync tempfile: {0}")]
|
||||
SyncTempfile(#[source] std::io::Error),
|
||||
#[error("rename tempfile to final path: {0}")]
|
||||
RenameTempfileToFinalPath(#[source] std::io::Error),
|
||||
#[error("open final path parent dir: {0}")]
|
||||
OpenFinalPathParentDir(#[source] std::io::Error),
|
||||
#[error("sync final path parent dir: {0}")]
|
||||
SyncFinalPathParentDir(#[source] std::io::Error),
|
||||
}
|
||||
impl CrashsafeOverwriteError {
|
||||
/// Returns true iff the new contents are durably stored.
|
||||
pub fn are_new_contents_durable(&self) -> bool {
|
||||
match self {
|
||||
Self::FinalPathHasNoParentDir => false,
|
||||
Self::RemovePreviousTempfile(_) => false,
|
||||
Self::CreateTempfile(_) => false,
|
||||
Self::WriteContents(_) => false,
|
||||
Self::SyncTempfile(_) => false,
|
||||
Self::RenameTempfileToFinalPath(_) => false,
|
||||
Self::OpenFinalPathParentDir(_) => false,
|
||||
Self::SyncFinalPathParentDir(_) => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true))
|
||||
pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true)).await
|
||||
}
|
||||
|
||||
/// Create a new file for writing. If the file exists, it will be truncated.
|
||||
/// Like File::create.
|
||||
pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
pub async fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
@@ -192,7 +230,7 @@ impl VirtualFile {
|
||||
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
|
||||
/// they will be applied also when the file is subsequently re-opened, not only
|
||||
/// on the first time. Make sure that's sane!
|
||||
pub fn open_with_options(
|
||||
pub async fn open_with_options(
|
||||
path: &Path,
|
||||
open_options: &OpenOptions,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
@@ -200,14 +238,14 @@ impl VirtualFile {
|
||||
let parts = path_str.split('/').collect::<Vec<&str>>();
|
||||
let tenant_id;
|
||||
let timeline_id;
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
|
||||
tenant_id = parts[parts.len() - 4].to_string();
|
||||
timeline_id = parts[parts.len() - 2].to_string();
|
||||
} else {
|
||||
tenant_id = "*".to_string();
|
||||
timeline_id = "*".to_string();
|
||||
}
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
|
||||
let file = STORAGE_IO_TIME
|
||||
.with_label_values(&["open"])
|
||||
.observe_closure_duration(|| open_options.open(path))?;
|
||||
@@ -236,21 +274,79 @@ impl VirtualFile {
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file("fsync", |file| file.sync_all())?
|
||||
/// Writes a file to the specified `final_path` in a crash safe fasion
|
||||
///
|
||||
/// The file is first written to the specified tmp_path, and in a second
|
||||
/// step, the tmp path is renamed to the final path. As renames are
|
||||
/// atomic, a crash during the write operation will never leave behind a
|
||||
/// partially written file.
|
||||
pub async fn crashsafe_overwrite(
|
||||
final_path: &Path,
|
||||
tmp_path: &Path,
|
||||
content: &[u8],
|
||||
) -> Result<(), CrashsafeOverwriteError> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
|
||||
};
|
||||
match std::fs::remove_file(tmp_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
|
||||
}
|
||||
let mut file = Self::open_with_options(
|
||||
tmp_path,
|
||||
OpenOptions::new()
|
||||
.write(true)
|
||||
// Use `create_new` so that, if we race with ourselves or something else,
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true),
|
||||
)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
|
||||
file.write_all(content)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::WriteContents)?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)
|
||||
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFiles, and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd =
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
|
||||
final_parent_dirfd
|
||||
.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
|
||||
Ok(())
|
||||
}
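The ordering that makes this crash-safe (durable temp-file write, atomic rename, then parent-directory fsync) can be sketched with plain `std::fs`; this is a simplified illustration of the protocol, not the VirtualFile-based implementation above, and it omits the leftover-tempfile cleanup:

```rust
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

// Simplified sketch of the crash-safe overwrite protocol.
fn crashsafe_overwrite_sketch(
    final_path: &Path,
    tmp_path: &Path,
    content: &[u8],
) -> std::io::Result<()> {
    let parent = final_path.parent().expect("final path needs a parent dir");
    // 1. Write the new contents to a temp file and make them durable.
    let mut tmp = OpenOptions::new().write(true).create_new(true).open(tmp_path)?;
    tmp.write_all(content)?;
    tmp.sync_all()?;
    drop(tmp); // close before the rename
    // 2. Atomically replace the final path with the temp file.
    fs::rename(tmp_path, final_path)?;
    // 3. Make the rename itself durable by fsyncing the parent directory.
    File::open(parent)?.sync_all()?;
    Ok(())
}
```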
|
||||
|
||||
pub fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file("metadata", |file| file.metadata())?
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file("fsync", |file| async move { file.sync_all() })
|
||||
.await?
|
||||
}
|
||||
|
||||
pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file("metadata", |file| async move { file.metadata() })
|
||||
.await?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
fn with_file<F, R>(&self, op: &str, mut func: F) -> Result<R, Error>
|
||||
async fn with_file<F, R, FR>(&self, _op: &str, func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnMut(&File) -> R,
|
||||
F: FnOnce(&File) -> FR,
|
||||
FR: Future<Output = R>,
|
||||
{
|
||||
let open_files = get_open_files();
|
||||
|
||||
@@ -261,19 +357,17 @@ impl VirtualFile {
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
let mut handle = *self.handle.read().await;
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
let slot_guard = slot.inner.read().await;
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(STORAGE_IO_TIME
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(file)));
|
||||
return Ok(func(file).await);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -281,7 +375,7 @@ impl VirtualFile {
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
let handle_guard = self.handle.write().await;
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
@@ -295,7 +389,7 @@ impl VirtualFile {
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot().await;
|
||||
|
||||
// Open the physical file
|
||||
let file = STORAGE_IO_TIME
|
||||
@@ -303,9 +397,7 @@ impl VirtualFile {
|
||||
.observe_closure_duration(|| self.open_options.open(&self.path))?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
let result = STORAGE_IO_TIME
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(&file));
|
||||
let result = func(&file).await;
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
@@ -321,60 +413,18 @@ impl VirtualFile {
|
||||
drop(self);
|
||||
std::fs::remove_file(path).expect("failed to remove the virtual file");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
// comparison.
|
||||
STORAGE_IO_TIME
|
||||
.with_label_values(&["close"])
|
||||
.observe_closure_duration(|| drop(slot_guard.file.take()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for VirtualFile {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.read_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for VirtualFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
// flush is a no-op for File (at least on unix), so we don't need to do
|
||||
// anything here either.
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for VirtualFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self.with_file("seek", |mut file| file.seek(SeekFrom::End(offset)))??
|
||||
self.pos = self
|
||||
.with_file("seek", |mut file| async move {
|
||||
file.seek(SeekFrom::End(offset))
|
||||
})
|
||||
.await??
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
@@ -392,11 +442,79 @@ impl Seek for VirtualFile {
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for VirtualFile {
|
||||
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self.with_file("read", |file| file.read_at(buf, offset))?;
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
|
||||
pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.read_at(buf, offset).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
"failed to fill whole buffer",
|
||||
))
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &mut buf[n..];
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
|
||||
pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.write_at(buf, offset).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
offset += n as u64;
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
|
||||
while !buf.is_empty() {
|
||||
match self.write(buf).await {
|
||||
Ok(0) => {
|
||||
return Err(Error::new(
|
||||
std::io::ErrorKind::WriteZero,
|
||||
"failed to write whole buffer",
|
||||
));
|
||||
}
|
||||
Ok(n) => {
|
||||
buf = &buf[n..];
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos).await?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file("read", |file| async move { file.read_at(buf, offset) })
|
||||
.await?;
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["read", &self.tenant_id, &self.timeline_id])
|
||||
@@ -405,8 +523,10 @@ impl FileExt for VirtualFile {
|
||||
result
|
||||
}
|
||||
|
||||
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self.with_file("write", |file| file.write_at(buf, offset))?;
|
||||
async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
let result = self
|
||||
.with_file("write", |file| async move { file.write_at(buf, offset) })
|
||||
.await?;
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&["write", &self.tenant_id, &self.timeline_id])
|
||||
@@ -416,6 +536,62 @@ impl FileExt for VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl VirtualFile {
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
let mut buf = [0; PAGE_SZ];
|
||||
self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
|
||||
.await?;
|
||||
Ok(std::sync::Arc::new(buf).into())
|
||||
}
|
||||
|
||||
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
|
||||
loop {
|
||||
let mut tmp = [0; 128];
|
||||
match self.read_at(&mut tmp, self.pos).await {
|
||||
Ok(0) => return Ok(()),
|
||||
Ok(n) => {
|
||||
self.pos += n as u64;
|
||||
buf.extend_from_slice(&tmp[..n]);
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut();
|
||||
|
||||
// We don't have async drop so we cannot wait for the lock here.
|
||||
// Instead, do a best-effort attempt at closing the underlying
|
||||
// file descriptor by using `try_write`.
|
||||
// This best-effort attempt should be quite good though
|
||||
// as we have `&mut self` access. In other words, if the slot
|
||||
// is still occupied by our file, we should be the only ones
|
||||
// accessing it (and if it has been reassigned since, we don't
|
||||
// need to bother with dropping anyways).
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let Ok(mut slot_guard) = slot.inner.try_write() else { return };
|
||||
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
// comparison.
|
||||
STORAGE_IO_TIME
|
||||
.with_label_values(&["close"])
|
||||
.observe_closure_duration(|| drop(slot_guard.file.take()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
@@ -469,33 +645,69 @@ mod tests {
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
|
||||
where
|
||||
FD: Read,
|
||||
{
|
||||
let mut buf = String::new();
|
||||
vfile.read_to_string(&mut buf)?;
|
||||
Ok(buf)
|
||||
enum MaybeVirtualFile {
|
||||
VirtualFile(VirtualFile),
|
||||
File(File),
|
||||
}
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
|
||||
where
|
||||
FD: FileExt,
|
||||
{
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0);
|
||||
vfile.read_exact_at(&mut buf, pos)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
impl MaybeVirtualFile {
|
||||
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
|
||||
MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
|
||||
}
|
||||
}
|
||||
async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
|
||||
}
|
||||
}
|
||||
async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.seek(pos).await,
|
||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||
}
|
||||
}
|
||||
async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
|
||||
MaybeVirtualFile::File(file) => file.write_all(buf),
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
async fn read_string(&mut self) -> Result<String, Error> {
|
||||
use std::io::Read;
|
||||
let mut buf = String::new();
|
||||
match self {
|
||||
MaybeVirtualFile::VirtualFile(file) => {
|
||||
let mut buf = Vec::new();
|
||||
file.read_to_end(&mut buf).await?;
|
||||
return Ok(String::from_utf8(buf).unwrap());
|
||||
}
|
||||
MaybeVirtualFile::File(file) => {
|
||||
file.read_to_string(&mut buf)?;
|
||||
}
|
||||
}
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = vec![0; len];
|
||||
self.read_exact_at(&mut buf, pos).await?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_virtual_files() -> Result<(), Error> {
|
||||
#[tokio::test]
|
||||
async fn test_virtual_files() -> Result<(), Error> {
|
||||
// The real work is done in the test_files() helper function. This
|
||||
// allows us to run the same set of tests against a native File, and
|
||||
// VirtualFile. We trust the native Files and wouldn't need to test them,
|
||||
@@ -503,95 +715,106 @@ mod tests {
|
||||
// results with VirtualFiles as with native Files. (Except that with
|
||||
// native files, you will run out of file descriptors if the ulimit
|
||||
// is low enough.)
|
||||
test_files("virtual_files", |path, open_options| {
|
||||
VirtualFile::open_with_options(path, open_options)
|
||||
test_files("virtual_files", |path, open_options| async move {
|
||||
let vf = VirtualFile::open_with_options(&path, &open_options).await?;
|
||||
Ok(MaybeVirtualFile::VirtualFile(vf))
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| {
|
||||
open_options.open(path)
|
||||
#[tokio::test]
|
||||
async fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| async move {
|
||||
Ok(MaybeVirtualFile::File(open_options.open(path)?))
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
FD: Read + Write + Seek + FileExt,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
|
||||
OF: Fn(PathBuf, OpenOptions) -> FT,
|
||||
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
|
||||
{
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
let mut file_a = openfunc(
|
||||
&path_a,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)?;
|
||||
file_a.write_all(b"foobar")?;
|
||||
path_a.clone(),
|
||||
OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_a.write_all(b"foobar").await?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
assert!(read_string(&mut file_a).is_err());
|
||||
let _ = file_a.read_string().await.unwrap_err();
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
|
||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
assert!(file_a.write(b"bar").is_err());
|
||||
let _ = file_a.write_all(b"bar").await.unwrap_err();
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", read_string(&mut file_a)?);
|
||||
assert_eq!("foobar", file_a.read_string().await?);
|
||||
|
||||
// It's positioned at the EOF now.
|
||||
assert_eq!("", read_string(&mut file_a)?);
|
||||
assert_eq!("", file_a.read_string().await?);
|
||||
|
||||
// Test seeks.
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
|
||||
assert_eq!("ar", read_string(&mut file_a)?);
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
|
||||
assert_eq!("ar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
|
||||
assert_eq!("bar", read_string(&mut file_a)?);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
|
||||
assert_eq!("bar", file_a.read_string().await?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
// Test erroneous seeks to before byte 0
|
||||
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
|
||||
file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
|
||||
|
||||
// the erroneous seek should have left the position unchanged
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
// Create another test file, and try FileExt functions on it.
|
||||
let path_b = testdir.join("file_b");
|
||||
let mut file_b = openfunc(
|
||||
&path_b,
|
||||
path_b.clone(),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true),
|
||||
)?;
|
||||
file_b.write_all_at(b"BAR", 3)?;
|
||||
file_b.write_all_at(b"FOO", 0)?;
|
||||
.truncate(true)
|
||||
.to_owned(),
|
||||
)
|
||||
.await?;
|
||||
file_b.write_all_at(b"BAR", 3).await?;
|
||||
file_b.write_all_at(b"FOO", 0).await?;
|
||||
|
||||
assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
|
||||
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
|
||||
|
||||
// Open a lot of files, enough to cause some evictions. (Or to be precise,
|
||||
// open the same file many times. The effect is the same.)
|
||||
//
|
||||
// leave file_a positioned at offset 1 before we start
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
|
||||
assert_eq!("FOOBAR", read_string(&mut vfile)?);
|
||||
let mut vfile =
|
||||
openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
|
||||
assert_eq!("FOOBAR", vfile.read_string().await?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
|
||||
@@ -600,13 +823,13 @@ mod tests {
|
||||
|
||||
// The underlying file descriptor for 'file_a' should be closed now. Try to read
|
||||
// from it again. We left the file positioned at offset 1 above.
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
assert_eq!("oobar", file_a.read_string().await?);
|
||||
|
||||
// Check that all the other FDs still work too. Use them in random order for
|
||||
// good measure.
|
||||
vfiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for vfile in vfiles.iter_mut() {
|
||||
assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
|
||||
assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -615,8 +838,8 @@ mod tests {
|
||||
/// Test using VirtualFiles from many threads concurrently. This tests both using
|
||||
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
|
||||
/// VirtualFile from multiple threads concurrently.
|
||||
#[test]
|
||||
fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
#[tokio::test]
|
||||
async fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const VIRTUAL_FILES: usize = 100;
|
||||
const THREADS: usize = 100;
|
||||
@@ -635,35 +858,36 @@ mod tests {
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
|
||||
.await?;
|
||||
files.push(f);
|
||||
}
|
||||
let files = Arc::new(files);
|
||||
|
||||
// Launch many threads, and use the virtual files concurrently in random order.
|
||||
let mut threads = Vec::new();
|
||||
for threadno in 0..THREADS {
|
||||
let builder =
|
||||
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.worker_threads(THREADS)
|
||||
.thread_name("test_vfile_concurrency thread")
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut hdls = Vec::new();
|
||||
for _threadno in 0..THREADS {
|
||||
let files = files.clone();
|
||||
let thread = builder
|
||||
.spawn(move || {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(thread);
|
||||
let hdl = rt.spawn(async move {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::rngs::OsRng;
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).await.unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
});
|
||||
hdls.push(hdl);
|
||||
}
|
||||
|
||||
for thread in threads {
|
||||
thread.join().unwrap();
|
||||
for hdl in hdls {
|
||||
hdl.await?;
|
||||
}
|
||||
std::mem::forget(rt);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -83,11 +83,12 @@ typedef struct FileCacheControl
|
||||
} FileCacheControl;
|
||||
|
||||
static HTAB* lfc_hash;
|
||||
static int lfc_desc;
|
||||
static int lfc_desc = 0;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_free_space_watermark;
|
||||
static bool lfc_disabled_by_failure = false;
|
||||
static char* lfc_path;
|
||||
static FileCacheControl* lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
@@ -96,6 +97,8 @@ static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
static int lfc_shrinking_factor; /* power of two by which the local cache size will be shrunk when lfc_free_space_watermark is reached */
|
||||
|
||||
#define DISABLE_LFC() (lfc_max_size = 0, lfc_disabled_by_failure = true, lfc_desc = -1)
|
||||
|
||||
void FileCacheMonitorMain(Datum main_arg);
|
||||
|
||||
static void
|
||||
@@ -168,7 +171,7 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
return;
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc == 0)
|
||||
if (lfc_desc <= 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
@@ -328,7 +331,7 @@ lfc_init(void)
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
if (lfc_max_size == 0)
|
||||
if (lfc_max_size == 0 || lfc_disabled_by_failure)
|
||||
return;
|
||||
|
||||
if (lfc_free_space_watermark != 0)
|
||||
@@ -357,7 +360,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
bool found;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -384,7 +387,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
@@ -455,7 +458,7 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
bool result = true;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -477,23 +480,25 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc == 0)
|
||||
if (lfc_desc <= 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
|
||||
if (lfc_desc < 0) {
|
||||
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
DISABLE_LFC();
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (lfc_desc > 0)
|
||||
{
|
||||
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
elog(INFO, "Failed to read file cache: %m");
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
DISABLE_LFC();
|
||||
result = false;
|
||||
}
|
||||
}
|
||||
@@ -523,7 +528,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
tag.rnode = rnode;
|
||||
@@ -570,12 +575,12 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc == 0)
|
||||
if (lfc_desc <= 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
DISABLE_LFC(); /* disable file cache */
|
||||
}
|
||||
}
|
||||
if (lfc_desc > 0)
|
||||
@@ -584,7 +589,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
DISABLE_LFC(); /* disable file cache */
|
||||
}
|
||||
}
|
||||
/* Place entry to the head of LRU list */
|
||||
|
||||
@@ -12,13 +12,19 @@ pub struct PasswordHackPayload {
|
||||
|
||||
impl PasswordHackPayload {
|
||||
pub fn parse(bytes: &[u8]) -> Option<Self> {
|
||||
// The format is `project=<utf-8>;<password-bytes>`.
|
||||
let mut iter = bytes.splitn_str(2, ";");
|
||||
let endpoint = iter.next()?.to_str().ok()?;
|
||||
let endpoint = parse_endpoint_param(endpoint)?.to_owned();
|
||||
let password = iter.next()?.to_owned();
|
||||
// The format is `project=<utf-8>;<password-bytes>` or `project=<utf-8>$<password-bytes>`.
|
||||
let separators = [";", "$"];
|
||||
for sep in separators {
|
||||
if let Some((endpoint, password)) = bytes.split_once_str(sep) {
|
||||
let endpoint = endpoint.to_str().ok()?;
|
||||
return Some(Self {
|
||||
endpoint: parse_endpoint_param(endpoint)?.to_owned(),
|
||||
password: password.to_owned(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Some(Self { endpoint, password })
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,4 +97,23 @@ mod tests {
|
||||
assert_eq!(payload.endpoint, "foobar");
|
||||
assert_eq!(payload.password, b"pass;word");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_password_hack_payload_dollar() {
|
||||
let bytes = b"";
|
||||
assert!(PasswordHackPayload::parse(bytes).is_none());
|
||||
|
||||
let bytes = b"endpoint=";
|
||||
assert!(PasswordHackPayload::parse(bytes).is_none());
|
||||
|
||||
let bytes = b"endpoint=$";
|
||||
let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
|
||||
assert_eq!(payload.endpoint, "");
|
||||
assert_eq!(payload.password, b"");
|
||||
|
||||
let bytes = b"endpoint=foobar$pass$word";
|
||||
let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
|
||||
assert_eq!(payload.endpoint, "foobar");
|
||||
assert_eq!(payload.password, b"pass$word");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,8 +63,8 @@ pub mod errors {
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
http::StatusCode::LOCKED => {
|
||||
// Status 423: project might be in maintenance mode (or bad state).
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
|
||||
}
|
||||
_ => REQUEST_FAILED.to_owned(),
|
||||
},
|
||||
@@ -81,9 +81,15 @@ pub mod errors {
|
||||
// retry some temporary failures because the compute was in a bad state
|
||||
// (bad request can be returned when the endpoint was in transition)
|
||||
Self::Console {
|
||||
status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
ref text,
|
||||
} => !text.contains("quota"),
|
||||
// retry server errors
|
||||
Self::Console { status, .. } if status.is_server_error() => true,
|
||||
_ => false,
|
||||
|
||||
@@ -8,6 +8,7 @@ use super::{
|
||||
use crate::{auth::ClientCredentials, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use std::net::SocketAddr;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
@@ -16,12 +17,21 @@ use tracing::{error, info, info_span, warn, Instrument};
|
||||
pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
caches: &'static ApiCaches,
|
||||
jwt: String,
|
||||
}
|
||||
|
||||
impl Api {
|
||||
/// Construct an API object containing the auth parameters.
|
||||
pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
|
||||
Self { endpoint, caches }
|
||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
};
|
||||
Self {
|
||||
endpoint,
|
||||
caches,
|
||||
jwt,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn url(&self) -> &str {
|
||||
@@ -39,6 +49,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
@@ -83,6 +94,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
@@ -106,7 +118,7 @@ impl Api {
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
@@ -183,9 +195,9 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
Err(ApiError::Console { status, text })
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
|
||||
let (host, port) = input.split_once(':')?;
|
||||
Some((host, port.parse().ok()?))
|
||||
fn parse_host_port(input: &str) -> Option<(String, u16)> {
|
||||
let parsed: SocketAddr = input.parse().ok()?;
|
||||
Some((parsed.ip().to_string(), parsed.port()))
|
||||
}
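A hedged illustration of what the new `SocketAddr`-based parsing accepts, compared with the previous `split_once(':')` approach: well-formed IPv4 and bracketed IPv6 addresses parse, while hostnames and malformed input now yield `None` (behavior assumed from the standard library parser):

```rust
// Illustrative expectations for the SocketAddr-based parse_host_port.
fn demo() {
    assert_eq!(
        parse_host_port("127.0.0.1:5432"),
        Some(("127.0.0.1".to_string(), 5432))
    );
    assert_eq!(parse_host_port("[::1]:5432"), Some(("::1".to_string(), 5432)));
    // Hostnames and garbage no longer parse, unlike the split_once version.
    assert_eq!(parse_host_port("localhost:5432"), None);
    assert_eq!(parse_host_port("not-an-address"), None);
}
```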
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -2,6 +2,7 @@ use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
protocol2::{ProxyProtocolAccept, WithClientIp},
|
||||
proxy::{handle_client, ClientMode},
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
@@ -292,6 +293,9 @@ pub async fn task_main(
|
||||
|
||||
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
|
||||
let _ = addr_incoming.set_nodelay(true);
|
||||
let addr_incoming = ProxyProtocolAccept {
|
||||
incoming: addr_incoming,
|
||||
};
|
||||
|
||||
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
|
||||
if let Err(err) = conn {
|
||||
@@ -302,9 +306,11 @@ pub async fn task_main(
|
||||
}
|
||||
});
|
||||
|
||||
let make_svc =
|
||||
hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream<AddrStream>| {
|
||||
let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string());
|
||||
let make_svc = hyper::service::make_service_fn(
|
||||
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
|
||||
let (io, tls) = stream.get_ref();
|
||||
let peer_addr = io.client_addr().unwrap_or(io.inner.remote_addr());
|
||||
let sni_name = tls.server_name().map(|s| s.to_string());
|
||||
let conn_pool = conn_pool.clone();
|
||||
|
||||
async move {
|
||||
@@ -319,13 +325,15 @@ pub async fn task_main(
|
||||
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
|
||||
.instrument(info_span!(
|
||||
"ws-client",
|
||||
session = %session_id
|
||||
session = %session_id,
|
||||
%peer_addr,
|
||||
))
|
||||
.await
|
||||
}
|
||||
}))
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
hyper::Server::builder(accept::from_stream(tls_listener))
|
||||
.serve(make_svc)
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod http;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
pub mod protocol2;
|
||||
pub mod proxy;
|
||||
pub mod sasl;
|
||||
pub mod scram;
|
||||
|
||||
proxy/src/protocol2.rs (new file, 479 lines)
@@ -0,0 +1,479 @@
|
||||
//! Proxy Protocol V2 implementation
|
||||
|
||||
use std::{
|
||||
future::poll_fn,
|
||||
future::Future,
|
||||
io,
|
||||
net::SocketAddr,
|
||||
pin::{pin, Pin},
|
||||
task::{ready, Context, Poll},
|
||||
};
|
||||
|
||||
use bytes::{Buf, BytesMut};
|
||||
use hyper::server::conn::{AddrIncoming, AddrStream};
|
||||
use pin_project_lite::pin_project;
|
||||
use tls_listener::AsyncAccept;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
|
||||
|
||||
pub struct ProxyProtocolAccept {
|
||||
pub incoming: AddrIncoming,
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
pub struct WithClientIp<T> {
|
||||
#[pin]
|
||||
pub inner: T,
|
||||
buf: BytesMut,
|
||||
tlv_bytes: u16,
|
||||
state: ProxyParse,
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Debug)]
|
||||
enum ProxyParse {
|
||||
NotStarted,
|
||||
|
||||
Finished(SocketAddr),
|
||||
None,
|
||||
}
|
||||
|
||||
impl<T: AsyncWrite> AsyncWrite for WithClientIp<T> {
|
||||
#[inline]
|
||||
fn poll_write(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &[u8],
|
||||
) -> Poll<Result<usize, io::Error>> {
|
||||
self.project().inner.poll_write(cx, buf)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().inner.poll_flush(cx)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
|
||||
self.project().inner.poll_shutdown(cx)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn poll_write_vectored(
|
||||
self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
bufs: &[io::IoSlice<'_>],
|
||||
) -> Poll<Result<usize, io::Error>> {
|
||||
self.project().inner.poll_write_vectored(cx, bufs)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_write_vectored(&self) -> bool {
|
||||
self.inner.is_write_vectored()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> WithClientIp<T> {
|
||||
pub fn new(inner: T) -> Self {
|
||||
WithClientIp {
|
||||
inner,
|
||||
buf: BytesMut::with_capacity(128),
|
||||
tlv_bytes: 0,
|
||||
state: ProxyParse::NotStarted,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn client_addr(&self) -> Option<SocketAddr> {
|
||||
match self.state {
|
||||
ProxyParse::Finished(socket) => Some(socket),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: AsyncRead + Unpin> WithClientIp<T> {
|
||||
pub async fn wait_for_addr(&mut self) -> io::Result<Option<SocketAddr>> {
|
||||
match self.state {
|
||||
ProxyParse::NotStarted => {
|
||||
let mut pin = Pin::new(&mut *self);
|
||||
let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?;
|
||||
match addr {
|
||||
Some(addr) => self.state = ProxyParse::Finished(addr),
|
||||
None => self.state = ProxyParse::None,
|
||||
}
|
||||
Ok(addr)
|
||||
}
|
||||
ProxyParse::Finished(addr) => Ok(Some(addr)),
|
||||
ProxyParse::None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Proxy Protocol Version 2 Header
|
||||
const HEADER: [u8; 12] = [
|
||||
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
|
||||
];
|
||||
|
||||
impl<T: AsyncRead> WithClientIp<T> {
|
||||
/// implementation of <https://www.haproxy.org/download/2.4/doc/proxy-protocol.txt>
|
||||
/// Version 2 (Binary Format)
|
||||
fn poll_client_ip(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<io::Result<Option<SocketAddr>>> {
|
||||
// The binary header format starts with a constant 12 bytes block containing the protocol signature :
|
||||
// \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A
|
||||
while self.buf.len() < 16 {
|
||||
let mut this = self.as_mut().project();
|
||||
let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?;
|
||||
|
||||
// exit for bad header
|
||||
let len = usize::min(self.buf.len(), HEADER.len());
|
||||
if self.buf[..len] != HEADER[..len] {
|
||||
return Poll::Ready(Ok(None));
|
||||
}
|
||||
|
||||
// if no more bytes available then exit
|
||||
if ready!(bytes_read) == 0 {
|
||||
return Poll::Ready(Ok(None));
|
||||
};
|
||||
}
|
||||
|
||||
// The next byte (the 13th one) is the protocol version and command.
|
||||
// The highest four bits contains the version. As of this specification, it must
|
||||
// always be sent as \x2 and the receiver must only accept this value.
|
||||
let vc = self.buf[12];
|
||||
let version = vc >> 4;
|
||||
let command = vc & 0b1111;
|
||||
if version != 2 {
|
||||
return Poll::Ready(Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"invalid proxy protocol version. expected version 2",
|
||||
)));
|
||||
}
|
||||
match command {
|
||||
// the connection was established on purpose by the proxy
|
||||
// without being relayed. The connection endpoints are the sender and the
|
||||
// receiver. Such connections exist when the proxy sends health-checks to the
|
||||
// server. The receiver must accept this connection as valid and must use the
|
||||
// real connection endpoints and discard the protocol block including the
|
||||
// family which is ignored.
|
||||
0 => {}
|
||||
// the connection was established on behalf of another node,
|
||||
// and reflects the original connection endpoints. The receiver must then use
|
||||
// the information provided in the protocol block to get original the address.
|
||||
1 => {}
|
||||
// other values are unassigned and must not be emitted by senders. Receivers
|
||||
// must drop connections presenting unexpected values here.
|
||||
_ => {
|
||||
return Poll::Ready(Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"invalid proxy protocol command. expected local (0) or proxy (1)",
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
// The 14th byte contains the transport protocol and address family. The highest 4
|
||||
// bits contain the address family, the lowest 4 bits contain the protocol.
|
||||
let ft = self.buf[13];
|
||||
let address_length = match ft {
|
||||
// - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET
|
||||
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
|
||||
// - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET
|
||||
// protocol family. Address length is 2*4 + 2*2 = 12 bytes.
|
||||
0x11 | 0x12 => 12,
|
||||
// - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6
|
||||
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
|
||||
// - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6
|
||||
// protocol family. Address length is 2*16 + 2*2 = 36 bytes.
|
||||
0x21 | 0x22 => 36,
|
||||
// unspecified or unix stream. ignore the addresses
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
// The 15th and 16th bytes is the address length in bytes in network endian order.
|
||||
// It is used so that the receiver knows how many address bytes to skip even when
|
||||
// it does not implement the presented protocol. Thus the length of the protocol
|
||||
// header in bytes is always exactly 16 + this value. When a sender presents a
|
||||
// LOCAL connection, it should not present any address so it sets this field to
|
||||
// zero. Receivers MUST always consider this field to skip the appropriate number
|
||||
// of bytes and must not assume zero is presented for LOCAL connections. When a
|
||||
// receiver accepts an incoming connection showing an UNSPEC address family or
|
||||
// protocol, it may or may not decide to log the address information if present.
|
||||
let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap());
|
||||
if remaining_length < address_length {
|
||||
return Poll::Ready(Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"invalid proxy protocol length. not enough to fit requested IP addresses",
|
||||
)));
|
||||
}
|
||||
|
||||
while self.buf.len() < 16 + address_length as usize {
|
||||
let mut this = self.as_mut().project();
|
||||
if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 {
|
||||
return Poll::Ready(Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
"stream closed while waiting for proxy protocol addresses",
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let this = self.as_mut().project();
|
||||
|
||||
// we are sure this is a proxy protocol v2 entry and we have read all the bytes we need
|
||||
// discard the header we have parsed
|
||||
this.buf.advance(16);
|
||||
|
||||
// Starting from the 17th byte, addresses are presented in network byte order.
|
||||
// The address order is always the same :
|
||||
// - source layer 3 address in network byte order
|
||||
// - destination layer 3 address in network byte order
|
||||
// - source layer 4 address if any, in network byte order (port)
|
||||
// - destination layer 4 address if any, in network byte order (port)
|
||||
let addresses = this.buf.split_to(address_length as usize);
|
||||
let socket = match address_length {
|
||||
12 => {
|
||||
let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap();
|
||||
let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap());
|
||||
Some(SocketAddr::from((src_addr, src_port)))
|
||||
}
|
||||
36 => {
|
||||
let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap();
|
||||
let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap());
|
||||
Some(SocketAddr::from((src_addr, src_port)))
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
|
||||
*this.tlv_bytes = remaining_length - address_length;
|
||||
self.as_mut().skip_tlv_inner();
|
||||
|
||||
Poll::Ready(Ok(socket))
|
||||
}
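The fixed prefix that the routine above reads incrementally is self-describing: the 12-byte signature, one version/command byte, one family/protocol byte, and a big-endian payload length. A minimal sketch of decoding just that 16-byte prefix from a plain byte slice, outside the async machinery (function and names are illustrative):

```rust
// Decode the fixed 16-byte PROXY v2 prefix into (version, command, family/proto, addr_len).
fn parse_v2_prefix(buf: &[u8]) -> Option<(u8, u8, u8, u16)> {
    const SIGNATURE: [u8; 12] = [
        0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
    ];
    if buf.len() < 16 || buf[..12] != SIGNATURE {
        return None;
    }
    let version = buf[12] >> 4; // must be 2
    let command = buf[12] & 0x0F; // 0 = LOCAL, 1 = PROXY
    let family_proto = buf[13]; // e.g. 0x11 = TCP over IPv4, 0x21 = TCP over IPv6
    let addr_len = u16::from_be_bytes([buf[14], buf[15]]); // bytes following the prefix
    (version == 2).then_some((version, command, family_proto, addr_len))
}
```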
|
||||
|
||||
#[cold]
|
||||
fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let ip = ready!(self.as_mut().poll_client_ip(cx)?);
|
||||
match ip {
|
||||
Some(x) => *self.as_mut().project().state = ProxyParse::Finished(x),
|
||||
None => *self.as_mut().project().state = ProxyParse::None,
|
||||
}
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
#[cold]
|
||||
fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
|
||||
let mut this = self.as_mut().project();
|
||||
// we know that this.buf is empty
|
||||
debug_assert_eq!(this.buf.len(), 0);
|
||||
|
||||
this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize);
|
||||
ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?);
|
||||
self.skip_tlv_inner();
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn skip_tlv_inner(self: Pin<&mut Self>) {
|
||||
let tlv_bytes_read = match u16::try_from(self.buf.len()) {
|
||||
// we read more than u16::MAX therefore we must have read the full tlv_bytes
|
||||
Err(_) => self.tlv_bytes,
|
||||
// we might not have read the full tlv bytes yet
|
||||
Ok(n) => u16::min(n, self.tlv_bytes),
|
||||
};
|
||||
let this = self.project();
|
||||
*this.tlv_bytes -= tlv_bytes_read;
|
||||
this.buf.advance(tlv_bytes_read as usize);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: AsyncRead> AsyncRead for WithClientIp<T> {
|
||||
#[inline]
|
||||
fn poll_read(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
buf: &mut ReadBuf<'_>,
|
||||
) -> Poll<io::Result<()>> {
|
||||
// I'm assuming these 3 comparisons will be easy to branch predict,
|
||||
// especially with the cold attributes,
|
||||
// which should make this read wrapper almost invisible.
|
||||
|
||||
if let ProxyParse::NotStarted = self.state {
|
||||
ready!(self.as_mut().read_ip(cx)?);
|
||||
}
|
||||
|
||||
while self.tlv_bytes > 0 {
|
||||
ready!(self.as_mut().skip_tlv(cx)?)
|
||||
}
|
||||
|
||||
let this = self.project();
|
||||
if this.buf.is_empty() {
|
||||
this.inner.poll_read(cx, buf)
|
||||
} else {
|
||||
// we know that tlv_bytes is 0
|
||||
debug_assert_eq!(*this.tlv_bytes, 0);
|
||||
|
||||
let write = usize::min(this.buf.len(), buf.remaining());
|
||||
let slice = this.buf.split_to(write).freeze();
|
||||
buf.put_slice(&slice);
|
||||
|
||||
// reset the allocation so it can be freed
|
||||
if this.buf.is_empty() {
|
||||
*this.buf = BytesMut::new();
|
||||
}
|
||||
|
||||
Poll::Ready(Ok(()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsyncAccept for ProxyProtocolAccept {
|
||||
type Connection = WithClientIp<AddrStream>;
|
||||
|
||||
type Error = io::Error;
|
||||
|
||||
fn poll_accept(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Connection, Self::Error>>> {
|
||||
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
|
||||
let Some(conn) = conn else {
|
||||
return Poll::Ready(None);
|
||||
};
|
||||
|
||||
Poll::Ready(Some(Ok(WithClientIp::new(conn))))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::pin::pin;
|
||||
|
||||
use tokio::io::AsyncReadExt;
|
||||
|
||||
use crate::protocol2::{ProxyParse, WithClientIp};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ipv4() {
|
||||
let header = super::HEADER
|
||||
// Proxy command, IPV4 | TCP
|
||||
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
|
||||
// 12 + 3 bytes
|
||||
.chain([0, 15].as_slice())
|
||||
// src ip
|
||||
.chain([127, 0, 0, 1].as_slice())
|
||||
// dst ip
|
||||
.chain([192, 168, 0, 1].as_slice())
|
||||
// src port
|
||||
.chain([255, 255].as_slice())
|
||||
// dst port
|
||||
.chain([1, 1].as_slice())
|
||||
// TLV
|
||||
.chain([1, 2, 3].as_slice());
|
||||
|
||||
let extra_data = [0x55; 256];
|
||||
|
||||
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
|
||||
|
||||
let mut bytes = vec![];
|
||||
read.read_to_end(&mut bytes).await.unwrap();
|
||||
|
||||
assert_eq!(bytes, extra_data);
|
||||
assert_eq!(
|
||||
read.state,
|
||||
ProxyParse::Finished(([127, 0, 0, 1], 65535).into())
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ipv6() {
|
||||
let header = super::HEADER
|
||||
// Proxy command, IPV6 | UDP
|
||||
.chain([(2 << 4) | 1, (2 << 4) | 2].as_slice())
|
||||
// 36 + 3 bytes
|
||||
.chain([0, 39].as_slice())
|
||||
// src ip
|
||||
.chain([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0].as_slice())
|
||||
// dst ip
|
||||
.chain([0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8].as_slice())
|
||||
// src port
|
||||
.chain([1, 1].as_slice())
|
||||
// dst port
|
||||
.chain([255, 255].as_slice())
|
||||
// TLV
|
||||
.chain([1, 2, 3].as_slice());
|
||||
|
||||
let extra_data = [0x55; 256];
|
||||
|
||||
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
|
||||
|
||||
let mut bytes = vec![];
|
||||
read.read_to_end(&mut bytes).await.unwrap();
|
||||
|
||||
assert_eq!(bytes, extra_data);
|
||||
assert_eq!(
|
||||
read.state,
|
||||
ProxyParse::Finished(
|
||||
([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_invalid() {
|
||||
let data = [0x55; 256];
|
||||
|
||||
let mut read = pin!(WithClientIp::new(data.as_slice()));
|
||||
|
||||
let mut bytes = vec![];
|
||||
read.read_to_end(&mut bytes).await.unwrap();
|
||||
assert_eq!(bytes, data);
|
||||
assert_eq!(read.state, ProxyParse::None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_short() {
|
||||
let data = [0x55; 10];
|
||||
|
||||
let mut read = pin!(WithClientIp::new(data.as_slice()));
|
||||
|
||||
let mut bytes = vec![];
|
||||
read.read_to_end(&mut bytes).await.unwrap();
|
||||
assert_eq!(bytes, data);
|
||||
assert_eq!(read.state, ProxyParse::None);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_large_tlv() {
|
||||
let tlv = vec![0x55; 32768];
|
||||
let len = (12 + tlv.len() as u16).to_be_bytes();
|
||||
|
||||
let header = super::HEADER
|
||||
// Proxy command, Inet << 4 | Stream
|
||||
.chain([(2 << 4) | 1, (1 << 4) | 1].as_slice())
|
||||
// 12 + 3 bytes
|
||||
.chain(len.as_slice())
|
||||
// src ip
|
||||
.chain([55, 56, 57, 58].as_slice())
|
||||
// dst ip
|
||||
.chain([192, 168, 0, 1].as_slice())
|
||||
// src port
|
||||
.chain([255, 255].as_slice())
|
||||
// dst port
|
||||
.chain([1, 1].as_slice())
|
||||
// TLV
|
||||
.chain(tlv.as_slice());
|
||||
|
||||
let extra_data = [0xaa; 256];
|
||||
|
||||
let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice())));
|
||||
|
||||
let mut bytes = vec![];
|
||||
read.read_to_end(&mut bytes).await.unwrap();
|
||||
|
||||
assert_eq!(bytes, extra_data);
|
||||
assert_eq!(
|
||||
read.state,
|
||||
ProxyParse::Finished(([55, 56, 57, 58], 65535).into())
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -7,6 +7,7 @@ use crate::{
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
};
use anyhow::{bail, Context};

@@ -100,7 +101,7 @@ pub async fn task_main(
    loop {
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
                let (socket, _) = accept_result?;

                let session_id = uuid::Uuid::new_v4();
                let cancel_map = Arc::clone(&cancel_map);

@@ -108,13 +109,19 @@ pub async fn task_main(
                async move {
                    info!("accepted postgres client connection");

                    let mut socket = WithClientIp::new(socket);
                    if let Some(ip) = socket.wait_for_addr().await? {
                        tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
                    }

                    socket
                        .inner
                        .set_nodelay(true)
                        .context("failed to set socket option")?;

                    handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
                }
                .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
                .unwrap_or_else(move |e| {
                    // Acknowledge that the task has finished with an error.
                    error!(?session_id, "per-client task finished with an error: {e:#}");

@@ -137,6 +137,7 @@ async fn dummy_proxy(
    auth: impl TestAuth + Send,
) -> anyhow::Result<()> {
    let cancel_map = CancelMap::default();
    let client = WithClientIp::new(client);
    let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map)
        .await?
        .context("handshake failed")?;

@@ -141,7 +141,7 @@ impl<S> Stream<S> {
    pub fn sni_hostname(&self) -> Option<&str> {
        match self {
            Stream::Raw { .. } => None,
            Stream::Tls { tls } => tls.get_ref().1.sni_hostname(),
            Stream::Tls { tls } => tls.get_ref().1.server_name(),
        }
    }
}
s3_scrubber/Cargo.toml (new file, 41 lines)
@@ -0,0 +1,41 @@
[package]
name = "s3_scrubber"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
aws-sdk-s3.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
either.workspace = true
tokio-rustls.workspace = true
anyhow.workspace = true
hex.workspace = true
thiserror.workspace = true
rand.workspace = true
bytes.workspace = true
bincode.workspace = true
crc32c.workspace = true
serde.workspace = true
serde_json.workspace = true
serde_with.workspace = true
workspace_hack.workspace = true
utils.workspace = true
async-stream.workspace = true
tokio-stream.workspace = true
futures-util.workspace = true
itertools.workspace = true

tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
reqwest = { workspace = true, default-features = false, features = ["rustls-tls", "json"] }
aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }

pageserver = { path = "../pageserver" }

tracing.workspace = true
tracing-subscriber.workspace = true
clap.workspace = true
tracing-appender = "0.2"
histogram = "0.7"
s3_scrubber/README.md (new file, 93 lines)
@@ -0,0 +1,93 @@
# Neon S3 scrubber

This tool directly accesses the S3 buckets used by the Neon `pageserver`
and `safekeeper`, and does housekeeping such as cleaning up objects for tenants and timelines that no longer exist.

## Usage

### Generic Parameters

#### S3

Run `aws sso login --profile dev` to get SSO access to the bucket you want to clean, and look up the `SSO_ACCOUNT_ID` for your profile (`cat ~/.aws/config` may help).

- `SSO_ACCOUNT_ID`: Credentials ID to use for accessing S3 buckets
- `REGION`: The region the bucket is located in.
- `BUCKET`: Bucket name

#### Console API

_This section is only relevant if using a command that requires access to Neon's internal control plane._

- `CLOUD_ADMIN_API_URL`: The base URL used to check tenant/timeline existence via the Cloud API, e.g. `https://<admin host>/admin`

- `CLOUD_ADMIN_API_TOKEN`: The token to provide when querying the admin API. Get one on the corresponding console page, e.g. `https://<admin host>/app/settings/api-keys`

### Commands

#### `tidy`

Iterates over the S3 buckets of storage nodes, checking their contents and removing data that is not present in the console. S3 data that is not removed is then further checked for discrepancies and, optionally, validated.

Unless the global `--delete` argument is provided, this command only dry-runs and logs
what it would have deleted.

```
tidy --node-kind=<safekeeper|pageserver> [--depth=<tenant|timeline>] [--skip-validation]
```

- `--node-kind`: whether to inspect the safekeeper or the pageserver bucket prefix
- `--depth`: whether to only search for deletable tenants, or to also search for
deletable timelines within active tenants. Default: `tenant`
- `--skip-validation`: skip additional post-deletion checks. Default: `false`

For the selected S3 path, the tool lists the given bucket for either tenants only or for both tenants and timelines. For every entry found, the console API is queried: any entity that is deleted or missing in the API is scheduled for deletion from S3.

If validation is enabled, only the timelines of non-deleted tenants are checked.
For the pageserver, each timeline's index_part.json on S3 is also checked for various discrepancies. No files are removed even if there are "extra" S3 files not present in index_part.json: due to the way the pageserver updates remote storage, it is better to do such removals manually, stopping the corresponding tenant first.

Command examples:

`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=safekeeper`

`env SSO_ACCOUNT_ID=369495373322 REGION=us-east-2 BUCKET=neon-staging-storage-us-east-2 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- tidy --node-kind=pageserver --depth=timeline`

When the dry-run stats look satisfactory, add `--delete` before the `tidy` command (after cargo's `--` separator) to
disable the dry run and run the binary with deletion enabled.
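
For example, to re-run the staging safekeeper invocation above with deletion enabled (same placeholders as above; shown only as an illustration):

`env SSO_ACCOUNT_ID=369495373322 REGION=eu-west-1 BUCKET=neon-dev-storage-eu-west-1 CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- --delete tidy --node-kind=safekeeper`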

Look for these lines (and the lines around them) in the logs for the final stats:

- `Finished listing the bucket for tenants`
- `Finished active tenant and timeline validation`
- `Total tenant deletion stats`
- `Total timeline deletion stats`

## Current implementation details

- The tool currently has no persistent state: instead, it writes very verbose logs, with every S3 delete request, every tenant/timeline id check, etc. logged.
Worse, any panic or early task error may force the tool to exit without printing the final summary, though all affected ids will still be in the logs. The tool retries internally, so it is error-resistant to some extent, and recent runs showed no traces of errors or panics.

- Instead of checking non-deleted tenants' timelines right away, the tool creates separate tasks (futures) for that, which complicates the logic and slows down the process; this should be fixed and done in one "task".

- The tool uses only remote resources (S3, the console) and does not access the pageserver/safekeeper nodes themselves.
Yet its S3 setup should be ready to run on any pageserver/safekeeper node, using the node's S3 credentials, so node API access logic could be implemented relatively simply on top.

## Cleanup procedure

### Pageserver preparations

If the S3 state is altered manually first, the pageserver's in-memory state will contain stale data about S3, and tenants/timelines may get recreated on S3 (by any layer upload caused by compaction, a pageserver restart, etc.). So before proceeding, tenants/timelines that are already deleted in the console must be removed from the pageservers.

First, we need to group pageservers by bucket: `https://<admin host>/admin/pageservers` can be used to list all nodes of an environment, and `cat /storage/pageserver/data/pageserver.toml` on every node shows the bucket name and region it uses.
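
A minimal sketch of that grouping step (an illustration only: the `jq` filter assumes the `data`/`instance_id` response fields from `cloud_admin_api.rs` in this change, and the `bucket_name`/`bucket_region` keys in `pageserver.toml` may be named differently in a given deployment):

```
# List the pageservers known to the console and print their instance ids
curl -s "https://<admin_host>/admin/pageservers" \
  -H "Accept: application/json" \
  -H "Authorization: Bearer ${NEON_CLOUD_ADMIN_API_STAGING_KEY}" | jq '.data[].instance_id'

# On each node, see which bucket and region it writes to
ssh <node> 'grep -E "bucket_name|bucket_region" /storage/pageserver/data/pageserver.toml'
```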

Per bucket, for every related pageserver id, find the deleted tenants:

`curl -X POST "https://<admin_host>/admin/check_pageserver/{id}" -H "Accept: application/json" -H "Authorization: Bearer ${NEON_CLOUD_ADMIN_API_STAGING_KEY}" | jq`

Use `?check_timelines=true` to also find deleted timelines, but that check runs a separate query for every live tenant, so it can take long and time out for big pageservers.

Note that some tenants/timelines could be marked as deleted in the console while the console still queries the node later to fully remove them: wait for some time to make sure an "extra" tenant/timeline is not going away by itself.

When all IDs are collected, manually go to every pageserver and detach/delete the tenant/timeline.
In the future, the cleanup tool may access pageservers directly, but for now it only has access to the console and S3.
s3_scrubber/src/checks.rs (new file, 437 lines)
@@ -0,0 +1,437 @@
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectId};
|
||||
use crate::delete_batch_producer::DeleteProducerStats;
|
||||
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget, MAX_RETRIES};
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub async fn validate_pageserver_active_tenant_and_timelines(
|
||||
s3_client: Arc<Client>,
|
||||
s3_root: RootTarget,
|
||||
admin_client: Arc<CloudAdminApiClient>,
|
||||
batch_producer_stats: DeleteProducerStats,
|
||||
) -> anyhow::Result<BranchCheckStats> {
|
||||
let Some(timeline_stats) = batch_producer_stats.timeline_stats else {
|
||||
info!("No tenant-only checks, exiting");
|
||||
return Ok(BranchCheckStats::default());
|
||||
};
|
||||
|
||||
let s3_active_projects = batch_producer_stats
|
||||
.tenant_stats
|
||||
.active_entries
|
||||
.into_iter()
|
||||
.map(|project| (project.id.clone(), project))
|
||||
.collect::<HashMap<_, _>>();
|
||||
info!("Validating {} active tenants", s3_active_projects.len());
|
||||
|
||||
let mut s3_active_branches_per_project = HashMap::<ProjectId, Vec<BranchData>>::new();
|
||||
let mut s3_blob_data = HashMap::<TenantTimelineId, S3TimelineBlobData>::new();
|
||||
for active_branch in timeline_stats.active_entries {
|
||||
let active_project_id = active_branch.project_id.clone();
|
||||
let active_branch_id = active_branch.id.clone();
|
||||
let active_timeline_id = active_branch.timeline_id;
|
||||
|
||||
s3_active_branches_per_project
|
||||
.entry(active_project_id.clone())
|
||||
.or_default()
|
||||
.push(active_branch);
|
||||
|
||||
let Some(active_project) = s3_active_projects.get(&active_project_id) else {
|
||||
error!("Branch {:?} for project {:?} has no such project in the active projects", active_branch_id, active_project_id);
|
||||
continue;
|
||||
};
|
||||
|
||||
let id = TenantTimelineId::new(active_project.tenant, active_timeline_id);
|
||||
s3_blob_data.insert(
|
||||
id,
|
||||
list_timeline_blobs(&s3_client, id, &s3_root)
|
||||
.await
|
||||
.with_context(|| format!("List timeline {id} blobs"))?,
|
||||
);
|
||||
}
|
||||
|
||||
let mut branch_checks = JoinSet::new();
|
||||
for (_, s3_active_project) in s3_active_projects {
|
||||
let project_id = &s3_active_project.id;
|
||||
let tenant_id = s3_active_project.tenant;
|
||||
|
||||
let mut console_active_branches =
|
||||
branches_for_project_with_retries(&admin_client, project_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Client API branches for project {project_id:?} retrieval")
|
||||
})?
|
||||
.into_iter()
|
||||
.map(|branch| (branch.id.clone(), branch))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let active_branches = s3_active_branches_per_project
|
||||
.remove(project_id)
|
||||
.unwrap_or_default();
|
||||
info!(
|
||||
"Spawning tasks for {} tenant {} active timelines",
|
||||
active_branches.len(),
|
||||
tenant_id
|
||||
);
|
||||
for s3_active_branch in active_branches {
|
||||
let console_branch = console_active_branches.remove(&s3_active_branch.id);
|
||||
let timeline_id = s3_active_branch.timeline_id;
|
||||
let id = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
let s3_data = s3_blob_data.remove(&id);
|
||||
let s3_root = s3_root.clone();
|
||||
branch_checks.spawn(
|
||||
async move {
|
||||
let check_errors = branch_cleanup_and_check_errors(
|
||||
&id,
|
||||
&s3_root,
|
||||
Some(&s3_active_branch),
|
||||
console_branch,
|
||||
s3_data,
|
||||
)
|
||||
.await;
|
||||
(id, check_errors)
|
||||
}
|
||||
.instrument(info_span!("check_timeline", id = %id)),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let mut total_stats = BranchCheckStats::default();
|
||||
while let Some((id, analysis)) = branch_checks
|
||||
.join_next()
|
||||
.await
|
||||
.transpose()
|
||||
.context("branch check task join")?
|
||||
{
|
||||
total_stats.add(id, analysis.errors);
|
||||
}
|
||||
Ok(total_stats)
|
||||
}
|
||||
|
||||
async fn branches_for_project_with_retries(
|
||||
admin_client: &CloudAdminApiClient,
|
||||
project_id: &ProjectId,
|
||||
) -> anyhow::Result<Vec<BranchData>> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match admin_client.branches_for_project(project_id, false).await {
|
||||
Ok(branches) => return Ok(branches),
|
||||
Err(e) => {
|
||||
error!("admin list branches for project {project_id:?} query failed: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to list branches for project {project_id:?} {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct BranchCheckStats {
|
||||
pub timelines_with_errors: HashMap<TenantTimelineId, Vec<String>>,
|
||||
pub normal_timelines: HashSet<TenantTimelineId>,
|
||||
}
|
||||
|
||||
impl BranchCheckStats {
|
||||
pub fn add(&mut self, id: TenantTimelineId, check_errors: Vec<String>) {
|
||||
if check_errors.is_empty() {
|
||||
if !self.normal_timelines.insert(id) {
|
||||
panic!("Checking branch with timeline {id} more than once")
|
||||
}
|
||||
} else {
|
||||
match self.timelines_with_errors.entry(id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
panic!("Checking branch with timeline {id} more than once")
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(check_errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TimelineAnalysis {
|
||||
/// Anomalies detected
|
||||
pub errors: Vec<String>,
|
||||
|
||||
/// Healthy-but-noteworthy, like old-versioned structures that are readable but
|
||||
/// worth reporting for awareness that we must not remove that old version decoding
|
||||
/// yet.
|
||||
pub warnings: Vec<String>,
|
||||
|
||||
/// Keys not referenced in metadata: candidates for removal
|
||||
pub garbage_keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl TimelineAnalysis {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
errors: Vec::new(),
|
||||
warnings: Vec::new(),
|
||||
garbage_keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn branch_cleanup_and_check_errors(
|
||||
id: &TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
console_branch: Option<BranchData>,
|
||||
s3_data: Option<S3TimelineBlobData>,
|
||||
) -> TimelineAnalysis {
|
||||
let mut result = TimelineAnalysis::new();
|
||||
|
||||
info!("Checking timeline {id}");
|
||||
|
||||
if let Some(s3_active_branch) = s3_active_branch {
|
||||
info!(
|
||||
"Checking console status for timeline for branch {:?}/{:?}",
|
||||
s3_active_branch.project_id, s3_active_branch.id
|
||||
);
|
||||
match console_branch {
|
||||
Some(_) => {result.errors.push(format!("Timeline has deleted branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
},
|
||||
None => {
|
||||
result.errors.push(format!("Timeline has no branch data in the console (id = {:?}, project_id = {:?}), recheck whether it got removed during the check",
|
||||
s3_active_branch.id, s3_active_branch.project_id))
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
match s3_data {
|
||||
Some(s3_data) => {
|
||||
result.garbage_keys.extend(s3_data.keys_to_remove);
|
||||
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
mut s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
result.errors.push(format!(
|
||||
"index_part.json version: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() {
|
||||
result.warnings.push(format!(
|
||||
"index_part.json version is not latest: {}",
|
||||
index_part.get_version()
|
||||
))
|
||||
}
|
||||
|
||||
if index_part.metadata.disk_consistent_lsn()
|
||||
!= index_part.get_disk_consistent_lsn()
|
||||
{
|
||||
result.errors.push(format!(
|
||||
"Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})",
|
||||
index_part.metadata.disk_consistent_lsn(),
|
||||
index_part.get_disk_consistent_lsn(),
|
||||
|
||||
))
|
||||
}
|
||||
|
||||
if index_part.layer_metadata.is_empty() {
|
||||
// not an error, can happen for branches with zero writes, but notice that
|
||||
info!("index_part.json has no layers");
|
||||
}
|
||||
|
||||
for (layer, metadata) in index_part.layer_metadata {
|
||||
if metadata.file_size == 0 {
|
||||
result.errors.push(format!(
|
||||
"index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(),
|
||||
))
|
||||
}
|
||||
|
||||
if !s3_layers.remove(&layer) {
|
||||
result.errors.push(format!(
|
||||
"index_part.json contains a layer {} that is not present in S3",
|
||||
layer.file_name(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
if !s3_layers.is_empty() {
|
||||
result.errors.push(format!(
|
||||
"index_part.json does not contain layers from S3: {:?}",
|
||||
s3_layers
|
||||
.iter()
|
||||
.map(|layer_name| layer_name.file_name())
|
||||
.collect::<Vec<_>>(),
|
||||
));
|
||||
result
|
||||
.garbage_keys
|
||||
.extend(s3_layers.iter().map(|layer_name| {
|
||||
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
|
||||
let delimiter = s3_root.delimiter();
|
||||
if !key.ends_with(delimiter) {
|
||||
key.push_str(delimiter);
|
||||
}
|
||||
key.push_str(&layer_name.file_name());
|
||||
key
|
||||
}));
|
||||
}
|
||||
}
|
||||
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
|
||||
parse_errors
|
||||
.into_iter()
|
||||
.map(|error| format!("parse error: {error}")),
|
||||
),
|
||||
}
|
||||
}
|
||||
None => result
|
||||
.errors
|
||||
.push("Timeline has no data on S3 at all".to_string()),
|
||||
}
|
||||
|
||||
if result.errors.is_empty() {
|
||||
info!("No check errors found");
|
||||
} else {
|
||||
warn!("Timeline metadata errors: {0:?}", result.errors);
|
||||
}
|
||||
|
||||
if !result.warnings.is_empty() {
|
||||
warn!("Timeline metadata warnings: {0:?}", result.warnings);
|
||||
}
|
||||
|
||||
if !result.garbage_keys.is_empty() {
|
||||
error!(
|
||||
"The following keys should be removed from S3: {0:?}",
|
||||
result.garbage_keys
|
||||
)
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3TimelineBlobData {
|
||||
pub blob_data: BlobDataParseResult,
|
||||
pub keys_to_remove: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum BlobDataParseResult {
|
||||
Parsed {
|
||||
index_part: IndexPart,
|
||||
s3_layers: HashSet<LayerFileName>,
|
||||
},
|
||||
Incorrect(Vec<String>),
|
||||
}
|
||||
|
||||
pub async fn list_timeline_blobs(
|
||||
s3_client: &Client,
|
||||
id: TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
) -> anyhow::Result<S3TimelineBlobData> {
|
||||
let mut s3_layers = HashSet::new();
|
||||
let mut index_part_object = None;
|
||||
|
||||
let timeline_dir_target = s3_root.timeline_root(&id);
|
||||
let mut continuation_token = None;
|
||||
|
||||
let mut errors = Vec::new();
|
||||
let mut keys_to_remove = Vec::new();
|
||||
|
||||
loop {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone())
|
||||
.await?;
|
||||
|
||||
let subdirectories = fetch_response.common_prefixes().unwrap_or_default();
|
||||
if !subdirectories.is_empty() {
|
||||
errors.push(format!(
|
||||
"S3 list response should not contain any subdirectories, but got {subdirectories:?}"
|
||||
));
|
||||
}
|
||||
|
||||
for (object, key) in fetch_response
|
||||
.contents()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|object| Some((object, object.key()?)))
|
||||
{
|
||||
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
|
||||
match blob_name {
|
||||
Some("index_part.json") => index_part_object = Some(object.clone()),
|
||||
Some(maybe_layer_name) => match maybe_layer_name.parse::<LayerFileName>() {
|
||||
Ok(new_layer) => {
|
||||
s3_layers.insert(new_layer);
|
||||
}
|
||||
Err(e) => {
|
||||
errors.push(
|
||||
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
|
||||
);
|
||||
keys_to_remove.push(key.to_string());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
errors.push(format!("S3 list response got an object with odd key {key}"));
|
||||
keys_to_remove.push(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
if index_part_object.is_none() {
|
||||
errors.push("S3 list response got no index_part.json file".to_string());
|
||||
}
|
||||
|
||||
if let Some(index_part_object_key) = index_part_object.as_ref().and_then(|object| object.key())
|
||||
{
|
||||
let index_part_bytes = download_object_with_retries(
|
||||
s3_client,
|
||||
&timeline_dir_target.bucket_name,
|
||||
index_part_object_key,
|
||||
)
|
||||
.await
|
||||
.context("index_part.json download")?;
|
||||
|
||||
match serde_json::from_slice(&index_part_bytes) {
|
||||
Ok(index_part) => {
|
||||
return Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
s3_layers,
|
||||
},
|
||||
keys_to_remove,
|
||||
})
|
||||
}
|
||||
Err(index_parse_error) => errors.push(format!(
|
||||
"index_part.json body parsing error: {index_parse_error}"
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
errors.push(format!(
|
||||
"Index part object {index_part_object:?} has no key"
|
||||
));
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
errors.push(
|
||||
"Unexpected: no errors did not lead to a successfully parsed blob return".to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Incorrect(errors),
|
||||
keys_to_remove,
|
||||
})
|
||||
}
|
||||
s3_scrubber/src/cloud_admin_api.rs (new file, 418 lines)
@@ -0,0 +1,418 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use reqwest::{header, Client, Url};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
context: String,
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(context: String, kind: ErrorKind) -> Self {
|
||||
Self { context, kind }
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match &self.kind {
|
||||
ErrorKind::RequestSend(e) => write!(
|
||||
f,
|
||||
"Failed to send a request. Context: {}, error: {}",
|
||||
self.context, e
|
||||
),
|
||||
ErrorKind::BodyRead(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to read a request body. Context: {}, error: {}",
|
||||
self.context, e
|
||||
)
|
||||
}
|
||||
ErrorKind::UnexpectedState => write!(f, "Unexpected state: {}", self.context),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize, Hash, PartialEq, Eq)]
|
||||
#[serde(transparent)]
|
||||
pub struct ProjectId(pub String);
|
||||
|
||||
#[derive(Clone, Debug, serde::Deserialize, Hash, PartialEq, Eq)]
|
||||
#[serde(transparent)]
|
||||
pub struct BranchId(pub String);
|
||||
|
||||
impl std::error::Error for Error {}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorKind {
|
||||
RequestSend(reqwest::Error),
|
||||
BodyRead(reqwest::Error),
|
||||
UnexpectedState,
|
||||
}
|
||||
|
||||
pub struct CloudAdminApiClient {
|
||||
request_limiter: Semaphore,
|
||||
token: String,
|
||||
base_url: Url,
|
||||
http_client: Client,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct AdminApiResponse<T> {
|
||||
data: T,
|
||||
total: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
pub struct PageserverData {
|
||||
pub id: u64,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub region_id: String,
|
||||
pub version: i64,
|
||||
pub instance_id: String,
|
||||
pub port: u16,
|
||||
pub http_host: String,
|
||||
pub http_port: u16,
|
||||
pub active: bool,
|
||||
pub projects_count: usize,
|
||||
pub availability_zone_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub struct SafekeeperData {
|
||||
pub id: u64,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub region_id: String,
|
||||
pub version: i64,
|
||||
pub instance_id: String,
|
||||
pub active: bool,
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub projects_count: usize,
|
||||
pub availability_zone_id: String,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub struct ProjectData {
|
||||
pub id: ProjectId,
|
||||
pub name: String,
|
||||
pub region_id: String,
|
||||
pub platform_id: String,
|
||||
pub user_id: String,
|
||||
pub pageserver_id: u64,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub tenant: TenantId,
|
||||
pub safekeepers: Vec<SafekeeperData>,
|
||||
pub deleted: bool,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub pg_version: u32,
|
||||
pub max_project_size: u64,
|
||||
pub remote_storage_size: u64,
|
||||
pub resident_size: u64,
|
||||
pub synthetic_storage_size: u64,
|
||||
pub compute_time: u64,
|
||||
pub data_transfer: u64,
|
||||
pub data_storage: u64,
|
||||
pub maintenance_set: Option<String>,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
pub struct BranchData {
|
||||
pub id: BranchId,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
pub name: String,
|
||||
pub project_id: ProjectId,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
pub parent_id: Option<BranchId>,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
|
||||
pub parent_lsn: Option<Lsn>,
|
||||
pub default: bool,
|
||||
pub deleted: bool,
|
||||
pub logical_size: Option<u64>,
|
||||
pub physical_size: Option<u64>,
|
||||
pub written_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl CloudAdminApiClient {
|
||||
pub fn new(token: String, base_url: Url) -> Self {
|
||||
Self {
|
||||
token,
|
||||
base_url,
|
||||
request_limiter: Semaphore::new(200),
|
||||
http_client: Client::new(), // TODO timeout configs at least
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn find_tenant_project(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<Option<ProjectData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("tenant_id", tenant_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let response: AdminApiResponse<Vec<ProjectData>> = response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find project for tenant".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
match response.data.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(
|
||||
response
|
||||
.data
|
||||
.into_iter()
|
||||
.next()
|
||||
.expect("Should have exactly one element"),
|
||||
)),
|
||||
too_many => Err(Error::new(
|
||||
format!("Find project for tenant returned {too_many} projects instead of 0 or 1"),
|
||||
ErrorKind::UnexpectedState,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn find_timeline_branch(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Option<BranchData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/branches"))
|
||||
.query(&[
|
||||
("timeline_id", timeline_id.to_string()),
|
||||
("show_deleted", "true".to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::RequestSend(e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let response: AdminApiResponse<Vec<BranchData>> = response.json().await.map_err(|e| {
|
||||
Error::new(
|
||||
"Find branch for timeline".to_string(),
|
||||
ErrorKind::BodyRead(e),
|
||||
)
|
||||
})?;
|
||||
match response.data.len() {
|
||||
0 => Ok(None),
|
||||
1 => Ok(Some(
|
||||
response
|
||||
.data
|
||||
.into_iter()
|
||||
.next()
|
||||
.expect("Should have exactly one element"),
|
||||
)),
|
||||
too_many => Err(Error::new(
|
||||
format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"),
|
||||
ErrorKind::UnexpectedState,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_pageservers(&self) -> Result<Vec<PageserverData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/pageservers"))
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| Error::new("List pageservers".to_string(), ErrorKind::RequestSend(e)))?;
|
||||
|
||||
let response: AdminApiResponse<Vec<PageserverData>> = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| Error::new("List pageservers".to_string(), ErrorKind::BodyRead(e)))?;
|
||||
|
||||
Ok(response.data)
|
||||
}
|
||||
|
||||
pub async fn list_safekeepers(&self) -> Result<Vec<SafekeeperData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/safekeepers"))
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| Error::new("List safekeepers".to_string(), ErrorKind::RequestSend(e)))?;
|
||||
|
||||
let response: AdminApiResponse<Vec<SafekeeperData>> = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| Error::new("List safekeepers".to_string(), ErrorKind::BodyRead(e)))?;
|
||||
|
||||
Ok(response.data)
|
||||
}
|
||||
|
||||
pub async fn projects_for_pageserver(
|
||||
&self,
|
||||
pageserver_id: u64,
|
||||
show_deleted: bool,
|
||||
) -> Result<Vec<ProjectData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("pageserver_id", &pageserver_id.to_string()),
|
||||
("show_deleted", &show_deleted.to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?;
|
||||
|
||||
let response: AdminApiResponse<Vec<ProjectData>> = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?;
|
||||
|
||||
Ok(response.data)
|
||||
}
|
||||
|
||||
pub async fn project_for_tenant(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
show_deleted: bool,
|
||||
) -> Result<Option<ProjectData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/projects"))
|
||||
.query(&[
|
||||
("search", &tenant_id.to_string()),
|
||||
("show_deleted", &show_deleted.to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?;
|
||||
|
||||
let response: AdminApiResponse<Vec<ProjectData>> = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?;
|
||||
|
||||
match response.data.as_slice() {
|
||||
[] => Ok(None),
|
||||
[_single] => Ok(Some(response.data.into_iter().next().unwrap())),
|
||||
multiple => Err(Error::new(
|
||||
format!("Got more than one project for tenant {tenant_id} : {multiple:?}"),
|
||||
ErrorKind::UnexpectedState,
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn branches_for_project(
|
||||
&self,
|
||||
project_id: &ProjectId,
|
||||
show_deleted: bool,
|
||||
) -> Result<Vec<BranchData>, Error> {
|
||||
let _permit = self
|
||||
.request_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("Semaphore is not closed");
|
||||
|
||||
let response = self
|
||||
.http_client
|
||||
.get(self.append_url("/branches"))
|
||||
.query(&[
|
||||
("project_id", &project_id.0),
|
||||
("show_deleted", &show_deleted.to_string()),
|
||||
])
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.bearer_auth(&self.token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::RequestSend(e)))?;
|
||||
|
||||
let response: AdminApiResponse<Vec<BranchData>> = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| Error::new("Project for tenant".to_string(), ErrorKind::BodyRead(e)))?;
|
||||
|
||||
Ok(response.data)
|
||||
}
|
||||
|
||||
fn append_url(&self, subpath: &str) -> Url {
|
||||
// TODO fugly, but `.join` does not work when called
|
||||
(self.base_url.to_string() + subpath)
|
||||
.parse()
|
||||
.unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}"))
|
||||
}
|
||||
}
|
||||
s3_scrubber/src/delete_batch_producer.rs (new file, 354 lines)
@@ -0,0 +1,354 @@
|
||||
mod tenant_batch;
|
||||
mod timeline_batch;
|
||||
|
||||
use std::future::Future;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::UnboundedReceiver;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task::{JoinHandle, JoinSet};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
|
||||
use crate::{list_objects_with_retries, RootTarget, S3Target, TraversingDepth, MAX_RETRIES};
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
/// Typical tenant to remove contains 1 layer and 1 index_part.json blobs
|
||||
/// Also, there are some non-standard tenants to remove, having more layers.
|
||||
/// delete_objects request allows up to 1000 keys, so be on a safe side and allow most
|
||||
/// batch processing tasks to do 1 delete objects request only.
|
||||
///
|
||||
/// Every batch item will be additionally S3 LS'ed later, so keep the batch size
|
||||
/// even lower to allow multiple concurrent tasks do the LS requests.
|
||||
const BATCH_SIZE: usize = 100;
|
||||
|
||||
pub struct DeleteBatchProducer {
|
||||
delete_tenants_sender_task: JoinHandle<anyhow::Result<ProcessedS3List<TenantId, ProjectData>>>,
|
||||
delete_timelines_sender_task:
|
||||
JoinHandle<anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>>>,
|
||||
delete_batch_creator_task: JoinHandle<()>,
|
||||
delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
|
||||
}
|
||||
|
||||
pub struct DeleteProducerStats {
|
||||
pub tenant_stats: ProcessedS3List<TenantId, ProjectData>,
|
||||
pub timeline_stats: Option<ProcessedS3List<TenantTimelineId, BranchData>>,
|
||||
}
|
||||
|
||||
impl DeleteProducerStats {
|
||||
pub fn tenants_checked(&self) -> usize {
|
||||
self.tenant_stats.entries_total
|
||||
}
|
||||
|
||||
pub fn active_tenants(&self) -> usize {
|
||||
self.tenant_stats.active_entries.len()
|
||||
}
|
||||
|
||||
pub fn timelines_checked(&self) -> usize {
|
||||
self.timeline_stats
|
||||
.as_ref()
|
||||
.map(|stats| stats.entries_total)
|
||||
.unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct DeleteBatch {
|
||||
pub tenants: Vec<TenantId>,
|
||||
pub timelines: Vec<TenantTimelineId>,
|
||||
}
|
||||
|
||||
impl DeleteBatch {
|
||||
pub fn merge(&mut self, other: Self) {
|
||||
self.tenants.extend(other.tenants);
|
||||
self.timelines.extend(other.timelines);
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.tenants.len() + self.timelines.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
impl DeleteBatchProducer {
|
||||
pub fn start(
|
||||
admin_client: Arc<CloudAdminApiClient>,
|
||||
s3_client: Arc<Client>,
|
||||
s3_root_target: RootTarget,
|
||||
traversing_depth: TraversingDepth,
|
||||
) -> Self {
|
||||
let (delete_elements_sender, mut delete_elements_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_elements_sender = Arc::new(delete_elements_sender);
|
||||
let admin_client = Arc::new(admin_client);
|
||||
|
||||
let (projects_to_check_sender, mut projects_to_check_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_tenants_root_target = s3_root_target.clone();
|
||||
let delete_tenants_client = Arc::clone(&s3_client);
|
||||
let delete_tenants_admin_client = Arc::clone(&admin_client);
|
||||
let delete_sender = Arc::clone(&delete_elements_sender);
|
||||
let delete_tenants_sender_task = tokio::spawn(
|
||||
async move {
|
||||
tenant_batch::schedule_cleanup_deleted_tenants(
|
||||
&delete_tenants_root_target,
|
||||
&delete_tenants_client,
|
||||
&delete_tenants_admin_client,
|
||||
projects_to_check_sender,
|
||||
delete_sender,
|
||||
traversing_depth,
|
||||
)
|
||||
.await
|
||||
}
|
||||
.instrument(info_span!("delete_tenants_sender")),
|
||||
);
|
||||
let delete_timelines_sender_task = tokio::spawn(async move {
|
||||
timeline_batch::schedule_cleanup_deleted_timelines(
|
||||
&s3_root_target,
|
||||
&s3_client,
|
||||
&admin_client,
|
||||
&mut projects_to_check_receiver,
|
||||
delete_elements_sender,
|
||||
)
|
||||
.in_current_span()
|
||||
.await
|
||||
});
|
||||
|
||||
let (delete_batch_sender, delete_batch_receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
let delete_batch_creator_task = tokio::spawn(
|
||||
async move {
|
||||
'outer: loop {
|
||||
let mut delete_batch = DeleteBatch::default();
|
||||
while delete_batch.len() < BATCH_SIZE {
|
||||
match delete_elements_receiver.recv().await {
|
||||
Some(new_task) => match new_task {
|
||||
Either::Left(tenant_id) => delete_batch.tenants.push(tenant_id),
|
||||
Either::Right(timeline_id) => {
|
||||
delete_batch.timelines.push(timeline_id)
|
||||
}
|
||||
},
|
||||
None => {
|
||||
info!("Task finished: sender dropped");
|
||||
delete_batch_sender.send(delete_batch).ok();
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !delete_batch.is_empty() {
|
||||
delete_batch_sender.send(delete_batch).ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("delete batch creator")),
|
||||
);
|
||||
|
||||
Self {
|
||||
delete_tenants_sender_task,
|
||||
delete_timelines_sender_task,
|
||||
delete_batch_creator_task,
|
||||
delete_batch_receiver: Arc::new(Mutex::new(delete_batch_receiver)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn subscribe(&self) -> Arc<Mutex<UnboundedReceiver<DeleteBatch>>> {
|
||||
self.delete_batch_receiver.clone()
|
||||
}
|
||||
|
||||
pub async fn join(self) -> anyhow::Result<DeleteProducerStats> {
|
||||
let (delete_tenants_task_result, delete_timelines_task_result, batch_task_result) = tokio::join!(
|
||||
self.delete_tenants_sender_task,
|
||||
self.delete_timelines_sender_task,
|
||||
self.delete_batch_creator_task,
|
||||
);
|
||||
|
||||
let tenant_stats = match delete_tenants_task_result {
|
||||
Ok(Ok(stats)) => stats,
|
||||
Ok(Err(tenant_deletion_error)) => return Err(tenant_deletion_error),
|
||||
Err(join_error) => {
|
||||
anyhow::bail!("Failed to join the delete tenant producing task: {join_error}")
|
||||
}
|
||||
};
|
||||
|
||||
let timeline_stats = match delete_timelines_task_result {
|
||||
Ok(Ok(stats)) => Some(stats),
|
||||
Ok(Err(timeline_deletion_error)) => return Err(timeline_deletion_error),
|
||||
Err(join_error) => {
|
||||
anyhow::bail!("Failed to join the delete timeline producing task: {join_error}")
|
||||
}
|
||||
};
|
||||
|
||||
match batch_task_result {
|
||||
Ok(()) => (),
|
||||
Err(join_error) => anyhow::bail!("Failed to join the batch forming task: {join_error}"),
|
||||
};
|
||||
|
||||
Ok(DeleteProducerStats {
|
||||
tenant_stats,
|
||||
timeline_stats,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ProcessedS3List<I, A> {
|
||||
pub entries_total: usize,
|
||||
pub entries_to_delete: Vec<I>,
|
||||
pub active_entries: Vec<A>,
|
||||
}
|
||||
|
||||
impl<I, A> Default for ProcessedS3List<I, A> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entries_total: 0,
|
||||
entries_to_delete: Vec::new(),
|
||||
active_entries: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I, A> ProcessedS3List<I, A> {
|
||||
fn merge(&mut self, other: Self) {
|
||||
self.entries_total += other.entries_total;
|
||||
self.entries_to_delete.extend(other.entries_to_delete);
|
||||
self.active_entries.extend(other.active_entries);
|
||||
}
|
||||
|
||||
fn change_ids<NewI>(self, transform: impl Fn(I) -> NewI) -> ProcessedS3List<NewI, A> {
|
||||
ProcessedS3List {
|
||||
entries_total: self.entries_total,
|
||||
entries_to_delete: self.entries_to_delete.into_iter().map(transform).collect(),
|
||||
active_entries: self.active_entries,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn process_s3_target_recursively<F, Fut, I, E, A>(
|
||||
s3_client: &Client,
|
||||
target: &S3Target,
|
||||
find_active_and_deleted_entries: F,
|
||||
) -> anyhow::Result<ProcessedS3List<I, A>>
|
||||
where
|
||||
I: FromStr<Err = E> + Send + Sync,
|
||||
E: Send + Sync + std::error::Error + 'static,
|
||||
F: FnOnce(Vec<I>) -> Fut + Clone,
|
||||
Fut: Future<Output = anyhow::Result<ProcessedS3List<I, A>>>,
|
||||
{
|
||||
let mut continuation_token = None;
|
||||
let mut total_entries = ProcessedS3List::default();
|
||||
|
||||
loop {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, target, continuation_token.clone()).await?;
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.common_prefixes()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|prefix| prefix.prefix())
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.strip_prefix(&target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
})
|
||||
.map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<I>>>()
|
||||
.context("list and parse bucket's entry ids")?;
|
||||
|
||||
total_entries.merge(
|
||||
(find_active_and_deleted_entries.clone())(new_entry_ids)
|
||||
.await
|
||||
.context("filter active and deleted entry ids")?,
|
||||
);
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(total_entries)
|
||||
}
|
||||
|
||||
enum FetchResult<A> {
|
||||
Found(A),
|
||||
Deleted,
|
||||
Absent,
|
||||
}
|
||||
|
||||
async fn split_to_active_and_deleted_entries<I, A, F, Fut>(
|
||||
new_entry_ids: Vec<I>,
|
||||
find_active_entry: F,
|
||||
) -> anyhow::Result<ProcessedS3List<I, A>>
|
||||
where
|
||||
I: std::fmt::Display + Send + Sync + 'static + Copy,
|
||||
A: Send + 'static,
|
||||
F: FnOnce(I) -> Fut + Send + Sync + 'static + Clone,
|
||||
Fut: Future<Output = anyhow::Result<FetchResult<A>>> + Send,
|
||||
{
|
||||
let entries_total = new_entry_ids.len();
|
||||
let mut check_tasks = JoinSet::new();
|
||||
let mut active_entries = Vec::with_capacity(entries_total);
|
||||
let mut entries_to_delete = Vec::with_capacity(entries_total);
|
||||
|
||||
for new_entry_id in new_entry_ids {
|
||||
let check_closure = find_active_entry.clone();
|
||||
check_tasks.spawn(
|
||||
async move {
|
||||
(
|
||||
new_entry_id,
|
||||
async {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
let closure_clone = check_closure.clone();
|
||||
match closure_clone(new_entry_id).await {
|
||||
Ok(active_entry) => return Ok(active_entry),
|
||||
Err(e) => {
|
||||
error!("find active entry admin API call failed: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to check entry {new_entry_id} {MAX_RETRIES} times")
|
||||
}
|
||||
.await,
|
||||
)
|
||||
}
|
||||
.instrument(info_span!("filter_active_entries")),
|
||||
);
|
||||
}
|
||||
|
||||
while let Some(task_result) = check_tasks.join_next().await {
|
||||
let (entry_id, entry_data_fetch_result) = task_result.context("task join")?;
|
||||
match entry_data_fetch_result.context("entry data fetch")? {
|
||||
FetchResult::Found(active_entry) => {
|
||||
info!("Entry {entry_id} is alive, cannot delete");
|
||||
active_entries.push(active_entry);
|
||||
}
|
||||
FetchResult::Deleted => {
|
||||
info!("Entry {entry_id} deleted in the admin data, can safely delete");
|
||||
entries_to_delete.push(entry_id);
|
||||
}
|
||||
FetchResult::Absent => {
|
||||
info!("Entry {entry_id} absent in the admin data, can safely delete");
|
||||
entries_to_delete.push(entry_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ProcessedS3List {
|
||||
entries_total,
|
||||
entries_to_delete,
|
||||
active_entries,
|
||||
})
|
||||
}
|
||||
s3_scrubber/src/delete_batch_producer/tenant_batch.rs (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cloud_admin_api::{CloudAdminApiClient, ProjectData};
|
||||
use crate::delete_batch_producer::FetchResult;
|
||||
use crate::{RootTarget, TraversingDepth};
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
use super::ProcessedS3List;
|
||||
|
||||
pub async fn schedule_cleanup_deleted_tenants(
|
||||
s3_root_target: &RootTarget,
|
||||
s3_client: &Arc<Client>,
|
||||
admin_client: &Arc<CloudAdminApiClient>,
|
||||
projects_to_check_sender: UnboundedSender<ProjectData>,
|
||||
delete_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
|
||||
traversing_depth: TraversingDepth,
|
||||
) -> anyhow::Result<ProcessedS3List<TenantId, ProjectData>> {
|
||||
info!(
|
||||
"Starting to list the bucket from root {}",
|
||||
s3_root_target.bucket_name()
|
||||
);
|
||||
s3_client
|
||||
.head_bucket()
|
||||
.bucket(s3_root_target.bucket_name())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
|
||||
|
||||
let check_client = Arc::clone(admin_client);
|
||||
let tenant_stats = super::process_s3_target_recursively(
|
||||
s3_client,
|
||||
s3_root_target.tenants_root(),
|
||||
|s3_tenants| async move {
|
||||
let another_client = Arc::clone(&check_client);
|
||||
super::split_to_active_and_deleted_entries(s3_tenants, move |tenant_id| async move {
|
||||
let project_data = another_client
|
||||
.find_tenant_project(tenant_id)
|
||||
.await
|
||||
.with_context(|| format!("Tenant {tenant_id} project admin check"))?;
|
||||
|
||||
Ok(if let Some(console_project) = project_data {
|
||||
if console_project.deleted {
|
||||
delete_sender.send(Either::Left(tenant_id)).ok();
|
||||
FetchResult::Deleted
|
||||
} else {
|
||||
if traversing_depth == TraversingDepth::Timeline {
|
||||
projects_to_check_sender.send(console_project.clone()).ok();
|
||||
}
|
||||
FetchResult::Found(console_project)
|
||||
}
|
||||
} else {
|
||||
delete_sender.send(Either::Left(tenant_id)).ok();
|
||||
FetchResult::Absent
|
||||
})
|
||||
})
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("tenant batch processing")?;
|
||||
|
||||
info!(
|
||||
"Among {} tenants, found {} tenants to delete and {} active ones",
|
||||
tenant_stats.entries_total,
|
||||
tenant_stats.entries_to_delete.len(),
|
||||
tenant_stats.active_entries.len(),
|
||||
);
|
||||
|
||||
let tenant_stats = match traversing_depth {
|
||||
TraversingDepth::Tenant => {
|
||||
info!("Finished listing the bucket for tenants only");
|
||||
tenant_stats
|
||||
}
|
||||
TraversingDepth::Timeline => {
|
||||
info!("Finished listing the bucket for tenants and sent {} active tenants to check for timelines", tenant_stats.active_entries.len());
|
||||
tenant_stats
|
||||
}
|
||||
};
|
||||
|
||||
Ok(tenant_stats)
|
||||
}
|
||||
s3_scrubber/src/delete_batch_producer/timeline_batch.rs (new file, 102 lines)
@@ -0,0 +1,102 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::Client;
|
||||
use either::Either;
|
||||
use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
use crate::cloud_admin_api::{BranchData, CloudAdminApiClient, ProjectData};
|
||||
use crate::delete_batch_producer::{FetchResult, ProcessedS3List};
|
||||
use crate::RootTarget;
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
pub async fn schedule_cleanup_deleted_timelines(
|
||||
s3_root_target: &RootTarget,
|
||||
s3_client: &Arc<Client>,
|
||||
admin_client: &Arc<CloudAdminApiClient>,
|
||||
projects_to_check_receiver: &mut UnboundedReceiver<ProjectData>,
|
||||
delete_elements_sender: Arc<UnboundedSender<Either<TenantId, TenantTimelineId>>>,
|
||||
) -> anyhow::Result<ProcessedS3List<TenantTimelineId, BranchData>> {
|
||||
info!(
|
||||
"Starting to list the bucket from root {}",
|
||||
s3_root_target.bucket_name()
|
||||
);
|
||||
s3_client
|
||||
.head_bucket()
|
||||
.bucket(s3_root_target.bucket_name())
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("bucket {} was not found", s3_root_target.bucket_name()))?;
|
||||
|
||||
let mut timeline_stats = ProcessedS3List::default();
|
||||
while let Some(project_to_check) = projects_to_check_receiver.recv().await {
|
||||
let check_client = Arc::clone(admin_client);
|
||||
|
||||
let check_s3_client = Arc::clone(s3_client);
|
||||
|
||||
let check_delete_sender = Arc::clone(&delete_elements_sender);
|
||||
|
||||
let check_root = s3_root_target.clone();
|
||||
|
||||
let new_stats = async move {
|
||||
let tenant_id_to_check = project_to_check.tenant;
|
||||
let check_target = check_root.timelines_root(&tenant_id_to_check);
|
||||
let stats = super::process_s3_target_recursively(
|
||||
&check_s3_client,
|
||||
&check_target,
|
||||
|s3_timelines| async move {
|
||||
let another_client = check_client.clone();
|
||||
super::split_to_active_and_deleted_entries(
|
||||
s3_timelines,
|
||||
move |timeline_id| async move {
|
||||
let console_branch = another_client
|
||||
.find_timeline_branch(timeline_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Timeline {timeline_id} branch admin check: {e}"
|
||||
)
|
||||
})?;
|
||||
|
||||
let id = TenantTimelineId::new(tenant_id_to_check, timeline_id);
|
||||
Ok(match console_branch {
|
||||
Some(console_branch) => {
|
||||
if console_branch.deleted {
|
||||
check_delete_sender.send(Either::Right(id)).ok();
|
||||
FetchResult::Deleted
|
||||
} else {
|
||||
FetchResult::Found(console_branch)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
check_delete_sender.send(Either::Right(id)).ok();
|
||||
FetchResult::Absent
|
||||
}
|
||||
})
|
||||
},
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("tenant {tenant_id_to_check} timeline batch processing"))?
|
||||
.change_ids(|timeline_id| TenantTimelineId::new(tenant_id_to_check, timeline_id));
|
||||
|
||||
Ok::<_, anyhow::Error>(stats)
|
||||
}
|
||||
.instrument(info_span!("delete_timelines_sender", tenant = %project_to_check.tenant))
|
||||
.await?;
|
||||
|
||||
timeline_stats.merge(new_stats);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Among {} timelines, found {} timelines to delete and {} active ones",
|
||||
timeline_stats.entries_total,
|
||||
timeline_stats.entries_to_delete.len(),
|
||||
timeline_stats.active_entries.len(),
|
||||
);
|
||||
|
||||
Ok(timeline_stats)
|
||||
}
|
||||
s3_scrubber/src/lib.rs (new file, 298 lines)
@@ -0,0 +1,298 @@
|
||||
pub mod checks;
|
||||
pub mod cloud_admin_api;
|
||||
pub mod delete_batch_producer;
|
||||
pub mod metadata_stream;
|
||||
mod s3_deletion;
|
||||
pub mod scan_metadata;
|
||||
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_config::environment::EnvironmentVariableCredentialsProvider;
|
||||
use aws_config::imds::credentials::ImdsCredentialsProvider;
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
use aws_config::sso::SsoCredentialsProvider;
|
||||
use aws_sdk_s3::config::Region;
|
||||
use aws_sdk_s3::{Client, Config};
|
||||
|
||||
use reqwest::Url;
|
||||
pub use s3_deletion::S3Deleter;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::error;
|
||||
use tracing_appender::non_blocking::WorkerGuard;
|
||||
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
const MAX_RETRIES: usize = 20;
|
||||
const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
|
||||
|
||||
pub const CLI_NAME: &str = "s3-scrubber";
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct S3Target {
|
||||
pub bucket_name: String,
|
||||
pub prefix_in_bucket: String,
|
||||
pub delimiter: String,
|
||||
}
|
||||
|
||||
#[derive(clap::ValueEnum, Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TraversingDepth {
|
||||
Tenant,
|
||||
Timeline,
|
||||
}
|
||||
|
||||
impl Display for TraversingDepth {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(match self {
|
||||
Self::Tenant => "tenant",
|
||||
Self::Timeline => "timeline",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl S3Target {
|
||||
pub fn with_sub_segment(&self, new_segment: &str) -> Self {
|
||||
let mut new_self = self.clone();
|
||||
let _ = new_self.prefix_in_bucket.pop();
|
||||
new_self.prefix_in_bucket =
|
||||
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
|
||||
new_self
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum RootTarget {
|
||||
Pageserver(S3Target),
|
||||
Safekeeper(S3Target),
|
||||
}
|
||||
|
||||
impl RootTarget {
|
||||
pub fn tenants_root(&self) -> &S3Target {
|
||||
match self {
|
||||
Self::Pageserver(root) => root,
|
||||
Self::Safekeeper(root) => root,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant_root(&self, tenant_id: &TenantId) -> S3Target {
|
||||
self.tenants_root().with_sub_segment(&tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timelines_root(&self, tenant_id: &TenantId) -> S3Target {
|
||||
match self {
|
||||
Self::Pageserver(_) => self.tenant_root(tenant_id).with_sub_segment("timelines"),
|
||||
Self::Safekeeper(_) => self.tenant_root(tenant_id),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline_root(&self, id: &TenantTimelineId) -> S3Target {
|
||||
self.timelines_root(&id.tenant_id)
|
||||
.with_sub_segment(&id.timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn bucket_name(&self) -> &str {
|
||||
match self {
|
||||
Self::Pageserver(root) => &root.bucket_name,
|
||||
Self::Safekeeper(root) => &root.bucket_name,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delimiter(&self) -> &str {
|
||||
match self {
|
||||
Self::Pageserver(root) => &root.delimiter,
|
||||
Self::Safekeeper(root) => &root.delimiter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BucketConfig {
|
||||
pub region: String,
|
||||
pub bucket: String,
|
||||
|
||||
/// Use SSO if this is set, else rely on AWS_* environment vars
|
||||
pub sso_account_id: Option<String>,
|
||||
}
|
||||
|
||||
impl Display for BucketConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}",
|
||||
self.sso_account_id.as_deref().unwrap_or("<none>"),
|
||||
self.region,
|
||||
self.bucket
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BucketConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
|
||||
let region = env::var("REGION").context("'REGION' param retrieval")?;
|
||||
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
|
||||
|
||||
Ok(Self {
|
||||
region,
|
||||
bucket,
|
||||
sso_account_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ConsoleConfig {
|
||||
pub admin_api_url: Url,
|
||||
}
|
||||
|
||||
impl ConsoleConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
let admin_api_url: Url = env::var("CLOUD_ADMIN_API_URL")
|
||||
.context("'CLOUD_ADMIN_API_URL' param retrieval")?
|
||||
.parse()
|
||||
.context("'CLOUD_ADMIN_API_URL' param parsing")?;
|
||||
|
||||
Ok(Self { admin_api_url })
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_cloud_admin_api_token_or_exit() -> String {
|
||||
match env::var(CLOUD_ADMIN_API_TOKEN_ENV_VAR) {
|
||||
Ok(token) => token,
|
||||
Err(env::VarError::NotPresent) => {
|
||||
error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable is not present");
|
||||
std::process::exit(1);
|
||||
}
|
||||
Err(env::VarError::NotUnicode(not_unicode_string)) => {
|
||||
error!("{CLOUD_ADMIN_API_TOKEN_ENV_VAR} env variable's value is not a valid unicode string: {not_unicode_string:?}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init_logging(file_name: &str) -> WorkerGuard {
|
||||
let (file_writer, guard) =
|
||||
tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name));
|
||||
|
||||
let file_logs = fmt::Layer::new()
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(file_writer);
|
||||
let stdout_logs = fmt::Layer::new()
|
||||
.with_target(false)
|
||||
.with_writer(std::io::stdout);
|
||||
tracing_subscriber::registry()
|
||||
.with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
|
||||
.with(file_logs)
|
||||
.with(stdout_logs)
|
||||
.init();
|
||||
|
||||
guard
|
||||
}
|
||||
|
||||
pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
let chain = CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
);
|
||||
|
||||
// Use SSO if we were given an account ID
|
||||
match account_id {
|
||||
Some(sso_account) => chain.or_else(
|
||||
"sso",
|
||||
SsoCredentialsProvider::builder()
|
||||
.account_id(sso_account)
|
||||
.role_name("PowerUserAccess")
|
||||
.start_url("https://neondb.awsapps.com/start")
|
||||
.region(Region::from_static("eu-central-1"))
|
||||
.build(),
|
||||
),
|
||||
None => chain,
|
||||
}
|
||||
.or_else(
|
||||
// Finally try IMDS
|
||||
"imds",
|
||||
ImdsCredentialsProvider::builder().build(),
|
||||
)
|
||||
};
|
||||
|
||||
let mut builder = Config::builder()
|
||||
.region(bucket_region)
|
||||
.credentials_provider(credentials_provider);
|
||||
|
||||
if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
|
||||
builder = builder.endpoint_url(endpoint)
|
||||
}
|
||||
|
||||
Client::from_conf(builder.build())
|
||||
}
|
||||
|
||||
async fn list_objects_with_retries(
|
||||
s3_client: &Client,
|
||||
s3_target: &S3Target,
|
||||
continuation_token: Option<String>,
|
||||
) -> anyhow::Result<aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match s3_client
|
||||
.list_objects_v2()
|
||||
.bucket(&s3_target.bucket_name)
|
||||
.prefix(&s3_target.prefix_in_bucket)
|
||||
.delimiter(&s3_target.delimiter)
|
||||
.set_continuation_token(continuation_token.clone())
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => return Ok(response),
|
||||
Err(e) => {
|
||||
error!("list_objects_v2 query failed: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to list objects {MAX_RETRIES} times")
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
key: &str,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
for _ in 0..MAX_RETRIES {
|
||||
let mut body_buf = Vec::new();
|
||||
let response_stream = match s3_client
|
||||
.get_object()
|
||||
.bucket(bucket_name)
|
||||
.key(key)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
error!("Failed to download object for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match response_stream
|
||||
.body
|
||||
.into_async_read()
|
||||
.read_to_end(&mut body_buf)
|
||||
.await
|
||||
{
|
||||
Ok(bytes_read) => {
|
||||
tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}");
|
||||
return Ok(body_buf);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to stream object body for key {key}: {e}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times")
|
||||
}
|
||||
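For reference, a minimal sketch of how the prefix helpers in lib.rs above compose, assuming the s3_scrubber crate and utils::id::TenantId are available as in this diff; the bucket name and tenant id here are made up purely for illustration:

use s3_scrubber::{RootTarget, S3Target};
use utils::id::TenantId;

fn main() -> anyhow::Result<()> {
    // Hypothetical pageserver root, mirroring the prefix built in main.rs below.
    let root = RootTarget::Pageserver(S3Target {
        bucket_name: "example-bucket".to_string(),
        prefix_in_bucket: "pageserver/v1/tenants/".to_string(),
        delimiter: "/".to_string(),
    });

    // Made-up tenant id, only to show how sub-segments are appended.
    let tenant_id: TenantId = "1f359dd625e519a1a4e8d7509690f6fc".parse()?;

    // with_sub_segment() drops the trailing delimiter and re-joins, so this prints
    // "pageserver/v1/tenants/1f359dd625e519a1a4e8d7509690f6fc/timelines/"
    println!("{}", root.timelines_root(&tenant_id).prefix_in_bucket);
    Ok(())
}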
s3_scrubber/src/main.rs (new file, 251 lines added)
@@ -0,0 +1,251 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Display;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::config::Region;
|
||||
use s3_scrubber::cloud_admin_api::CloudAdminApiClient;
|
||||
use s3_scrubber::delete_batch_producer::DeleteBatchProducer;
|
||||
use s3_scrubber::scan_metadata::scan_metadata;
|
||||
use s3_scrubber::{
|
||||
checks, get_cloud_admin_api_token_or_exit, init_logging, init_s3_client, BucketConfig,
|
||||
ConsoleConfig, RootTarget, S3Deleter, S3Target, TraversingDepth, CLI_NAME,
|
||||
};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
delete: bool,
|
||||
}
|
||||
|
||||
#[derive(ValueEnum, Clone, Copy, Eq, PartialEq)]
|
||||
enum NodeKind {
|
||||
Safekeeper,
|
||||
Pageserver,
|
||||
}
|
||||
|
||||
impl NodeKind {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Safekeeper => "safekeeper",
|
||||
Self::Pageserver => "pageserver",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for NodeKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Command {
|
||||
Tidy {
|
||||
#[arg(short, long)]
|
||||
node_kind: NodeKind,
|
||||
#[arg(short, long, default_value_t=TraversingDepth::Tenant)]
|
||||
depth: TraversingDepth,
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
skip_validation: bool,
|
||||
},
|
||||
ScanMetadata {},
|
||||
}
|
||||
|
||||
async fn tidy(
|
||||
cli: &Cli,
|
||||
bucket_config: BucketConfig,
|
||||
console_config: ConsoleConfig,
|
||||
node_kind: NodeKind,
|
||||
depth: TraversingDepth,
|
||||
skip_validation: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let dry_run = !cli.delete;
|
||||
let file_name = if dry_run {
|
||||
format!(
|
||||
"{}_{}_{}__dry.log",
|
||||
CLI_NAME,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"{}_{}_{}.log",
|
||||
CLI_NAME,
|
||||
node_kind,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
)
|
||||
};
|
||||
|
||||
let _guard = init_logging(&file_name);
|
||||
|
||||
if dry_run {
|
||||
info!("Dry run, not removing items for real");
|
||||
} else {
|
||||
warn!("Dry run disabled, removing bucket items for real");
|
||||
}
|
||||
|
||||
info!("skip_validation={skip_validation}");
|
||||
|
||||
info!("Starting extra S3 removal in {bucket_config} for node kind '{node_kind}', traversing depth: {depth:?}");
|
||||
|
||||
info!("Starting extra tenant S3 removal in {bucket_config} for node kind '{node_kind}'");
|
||||
let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(
|
||||
get_cloud_admin_api_token_or_exit(),
|
||||
console_config.admin_api_url,
|
||||
));
|
||||
|
||||
let bucket_region = Region::new(bucket_config.region);
|
||||
let delimiter = "/".to_string();
|
||||
let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
|
||||
let s3_root = match node_kind {
|
||||
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: ["pageserver", "v1", "tenants", ""].join(&delimiter),
|
||||
delimiter,
|
||||
}),
|
||||
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
|
||||
bucket_name: bucket_config.bucket,
|
||||
prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
|
||||
delimiter,
|
||||
}),
|
||||
};
|
||||
|
||||
let delete_batch_producer = DeleteBatchProducer::start(
|
||||
Arc::clone(&cloud_admin_api_client),
|
||||
Arc::clone(&s3_client),
|
||||
s3_root.clone(),
|
||||
depth,
|
||||
);
|
||||
|
||||
let s3_deleter = S3Deleter::new(
|
||||
dry_run,
|
||||
NonZeroUsize::new(15).unwrap(),
|
||||
Arc::clone(&s3_client),
|
||||
delete_batch_producer.subscribe(),
|
||||
s3_root.clone(),
|
||||
);
|
||||
|
||||
let (deleter_task_result, batch_producer_task_result) =
|
||||
tokio::join!(s3_deleter.remove_all(), delete_batch_producer.join());
|
||||
|
||||
let deletion_stats = deleter_task_result.context("s3 deletion")?;
|
||||
info!(
|
||||
"Deleted {} tenants ({} keys) and {} timelines ({} keys) total. Dry run: {}",
|
||||
deletion_stats.deleted_tenant_keys.len(),
|
||||
deletion_stats.deleted_tenant_keys.values().sum::<usize>(),
|
||||
deletion_stats.deleted_timeline_keys.len(),
|
||||
deletion_stats.deleted_timeline_keys.values().sum::<usize>(),
|
||||
dry_run,
|
||||
);
|
||||
info!(
|
||||
"Total tenant deletion stats: {:?}",
|
||||
deletion_stats
|
||||
.deleted_tenant_keys
|
||||
.into_iter()
|
||||
.map(|(id, key)| (id.to_string(), key))
|
||||
.collect::<HashMap<_, _>>()
|
||||
);
|
||||
info!(
|
||||
"Total timeline deletion stats: {:?}",
|
||||
deletion_stats
|
||||
.deleted_timeline_keys
|
||||
.into_iter()
|
||||
.map(|(id, key)| (id.to_string(), key))
|
||||
.collect::<HashMap<_, _>>()
|
||||
);
|
||||
|
||||
let batch_producer_stats = batch_producer_task_result.context("delete batch producer join")?;
|
||||
info!(
|
||||
"Total bucket tenants listed: {}; for {} active tenants, timelines checked: {}",
|
||||
batch_producer_stats.tenants_checked(),
|
||||
batch_producer_stats.active_tenants(),
|
||||
batch_producer_stats.timelines_checked()
|
||||
);
|
||||
|
||||
if node_kind != NodeKind::Pageserver {
|
||||
info!("node_kind != pageserver, finish without performing validation step");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if skip_validation {
|
||||
info!("--skip-validation is set, exiting");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!("validating active tenants and timelines for pageserver S3 data");
|
||||
|
||||
// TODO kb real stats for validation + better stats for every place: add and print `min`, `max`, `mean` values at least
|
||||
let validation_stats = checks::validate_pageserver_active_tenant_and_timelines(
|
||||
s3_client,
|
||||
s3_root,
|
||||
cloud_admin_api_client,
|
||||
batch_producer_stats,
|
||||
)
|
||||
.await
|
||||
.context("active tenant and timeline validation")?;
|
||||
info!("Finished active tenant and timeline validation, correct timelines: {}, timeline validation errors: {}",
|
||||
validation_stats.normal_timelines.len(), validation_stats.timelines_with_errors.len());
|
||||
if !validation_stats.timelines_with_errors.is_empty() {
|
||||
warn!(
|
||||
"Validation errors: {:#?}",
|
||||
validation_stats
|
||||
.timelines_with_errors
|
||||
.into_iter()
|
||||
.map(|(id, errors)| (id.to_string(), format!("{errors:?}")))
|
||||
.collect::<HashMap<_, _>>()
|
||||
);
|
||||
}
|
||||
|
||||
info!("Done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
let bucket_config = BucketConfig::from_env()?;
|
||||
|
||||
match cli.command {
|
||||
Command::Tidy {
|
||||
node_kind,
|
||||
depth,
|
||||
skip_validation,
|
||||
} => {
|
||||
let console_config = ConsoleConfig::from_env()?;
|
||||
tidy(
|
||||
&cli,
|
||||
bucket_config,
|
||||
console_config,
|
||||
node_kind,
|
||||
depth,
|
||||
skip_validation,
|
||||
)
|
||||
.await
|
||||
}
|
||||
Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
}
|
||||
Ok(summary) => {
|
||||
println!("{}", summary.summary_string());
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
s3_scrubber/src/metadata_stream.rs (new file, 106 lines added)
@@ -0,0 +1,106 @@
use anyhow::Context;
use async_stream::{stream, try_stream};
use aws_sdk_s3::Client;
use tokio_stream::Stream;

use crate::{list_objects_with_retries, RootTarget, TenantId};
use utils::id::{TenantTimelineId, TimelineId};

/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
    s3_client: &'a Client,
    target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
    try_stream! {
        let mut continuation_token = None;
        loop {
            let tenants_target = target.tenants_root();
            let fetch_response =
                list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;

            let new_entry_ids = fetch_response
                .common_prefixes()
                .unwrap_or_default()
                .iter()
                .filter_map(|prefix| prefix.prefix())
                .filter_map(|prefix| -> Option<&str> {
                    prefix
                        .strip_prefix(&tenants_target.prefix_in_bucket)?
                        .strip_suffix('/')
                })
                .map(|entry_id_str| {
                    entry_id_str
                        .parse()
                        .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
                });

            for i in new_entry_ids {
                yield i?;
            }

            match fetch_response.next_continuation_token {
                Some(new_token) => continuation_token = Some(new_token),
                None => break,
            }
        }
    }
}

/// Given a TenantId, output a stream of the timelines within that tenant, discovered
/// using ListObjectsv2. The listing is done before the stream is built, so that this
/// function can be used to generate concurrency on a stream using buffer_unordered.
pub async fn stream_tenant_timelines<'a>(
    s3_client: &'a Client,
    target: &'a RootTarget,
    tenant: TenantId,
) -> anyhow::Result<impl Stream<Item = Result<TenantTimelineId, anyhow::Error>> + 'a> {
    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
    let mut continuation_token = None;
    let timelines_target = target.timelines_root(&tenant);

    loop {
        tracing::info!("Listing in {}", tenant);
        let fetch_response =
            list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone())
                .await;
        let fetch_response = match fetch_response {
            Err(e) => {
                timeline_ids.push(Err(e));
                break;
            }
            Ok(r) => r,
        };

        let new_entry_ids = fetch_response
            .common_prefixes()
            .unwrap_or_default()
            .iter()
            .filter_map(|prefix| prefix.prefix())
            .filter_map(|prefix| -> Option<&str> {
                prefix
                    .strip_prefix(&timelines_target.prefix_in_bucket)?
                    .strip_suffix('/')
            })
            .map(|entry_id_str| {
                entry_id_str
                    .parse::<TimelineId>()
                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
            });

        for i in new_entry_ids {
            timeline_ids.push(i);
        }

        match fetch_response.next_continuation_token {
            Some(new_token) => continuation_token = Some(new_token),
            None => break,
        }
    }

    tracing::info!("Yielding for {}", tenant);
    Ok(stream! {
        for i in timeline_ids {
            let id = i?;
            yield Ok(TenantTimelineId::new(tenant, id));
        }
    })
}
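As a usage note, not part of the diff: the tenant stream above is typically pinned and drained with futures_util, along these lines. The client and root target are assumed to come from init_s3_client() and the prefix construction shown in lib.rs and main.rs; the function name is hypothetical.

use futures_util::{pin_mut, StreamExt};
use s3_scrubber::metadata_stream::stream_tenants;
use s3_scrubber::RootTarget;

// Assumes an already-initialised S3 client and root target.
async fn print_tenants(s3_client: &aws_sdk_s3::Client, target: &RootTarget) -> anyhow::Result<()> {
    let tenants = stream_tenants(s3_client, target);
    pin_mut!(tenants);
    while let Some(tenant) = tenants.next().await {
        // Each item is an anyhow::Result<TenantId>; bail out on the first parse error.
        println!("{}", tenant?);
    }
    Ok(())
}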
s3_scrubber/src/s3_deletion.rs (new file, 434 lines added)
@@ -0,0 +1,434 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::types::{Delete, ObjectIdentifier};
|
||||
use aws_sdk_s3::Client;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use tokio::sync::mpsc::UnboundedReceiver;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info, info_span, Instrument};
|
||||
|
||||
use crate::delete_batch_producer::DeleteBatch;
|
||||
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantId, MAX_RETRIES};
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub struct S3Deleter {
|
||||
dry_run: bool,
|
||||
concurrent_tasks_count: NonZeroUsize,
|
||||
delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
|
||||
s3_client: Arc<Client>,
|
||||
s3_target: RootTarget,
|
||||
}
|
||||
|
||||
impl S3Deleter {
|
||||
pub fn new(
|
||||
dry_run: bool,
|
||||
concurrent_tasks_count: NonZeroUsize,
|
||||
s3_client: Arc<Client>,
|
||||
delete_batch_receiver: Arc<Mutex<UnboundedReceiver<DeleteBatch>>>,
|
||||
s3_target: RootTarget,
|
||||
) -> Self {
|
||||
Self {
|
||||
dry_run,
|
||||
concurrent_tasks_count,
|
||||
delete_batch_receiver,
|
||||
s3_client,
|
||||
s3_target,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn remove_all(self) -> anyhow::Result<DeletionStats> {
|
||||
let mut deletion_tasks = JoinSet::new();
|
||||
for id in 0..self.concurrent_tasks_count.get() {
|
||||
let closure_client = Arc::clone(&self.s3_client);
|
||||
let closure_s3_target = self.s3_target.clone();
|
||||
let closure_batch_receiver = Arc::clone(&self.delete_batch_receiver);
|
||||
let dry_run = self.dry_run;
|
||||
deletion_tasks.spawn(
|
||||
async move {
|
||||
info!("Task started");
|
||||
(
|
||||
id,
|
||||
async move {
|
||||
let mut task_stats = DeletionStats::default();
|
||||
loop {
|
||||
let mut guard = closure_batch_receiver.lock().await;
|
||||
let receiver_result = guard.try_recv();
|
||||
drop(guard);
|
||||
match receiver_result {
|
||||
Ok(batch) => {
|
||||
let stats = delete_batch(
|
||||
&closure_client,
|
||||
&closure_s3_target,
|
||||
batch,
|
||||
dry_run,
|
||||
)
|
||||
.await
|
||||
.context("batch deletion")?;
|
||||
debug!(
|
||||
"Batch processed, number of objects deleted per tenant in the batch is: {}, per timeline — {}",
|
||||
stats.deleted_tenant_keys.len(),
|
||||
stats.deleted_timeline_keys.len(),
|
||||
);
|
||||
task_stats.merge(stats);
|
||||
}
|
||||
Err(TryRecvError::Empty) => {
|
||||
debug!("No tasks yet, waiting");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
continue;
|
||||
}
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
info!("Task finished: sender dropped");
|
||||
return Ok(task_stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.in_current_span()
|
||||
.await,
|
||||
)
|
||||
}
|
||||
.instrument(info_span!("deletion_task", %id)),
|
||||
);
|
||||
}
|
||||
|
||||
let mut total_stats = DeletionStats::default();
|
||||
while let Some(task_result) = deletion_tasks.join_next().await {
|
||||
match task_result {
|
||||
Ok((id, Ok(task_stats))) => {
|
||||
info!("Task {id} completed");
|
||||
total_stats.merge(task_stats);
|
||||
}
|
||||
Ok((id, Err(e))) => {
|
||||
error!("Task {id} failed: {e:#}");
|
||||
return Err(e);
|
||||
}
|
||||
Err(join_error) => anyhow::bail!("Failed to join on a task: {join_error:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(total_stats)
|
||||
}
|
||||
}
|
||||
|
||||
/// S3 delete_objects allows up to 1000 keys to be passed in a single request.
|
||||
/// Yet if you pass too many key requests, apparently S3 could return with OK and
|
||||
/// actually delete nothing, so keep the number lower.
|
||||
const MAX_ITEMS_TO_DELETE: usize = 200;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DeletionStats {
|
||||
pub deleted_tenant_keys: BTreeMap<TenantId, usize>,
|
||||
pub deleted_timeline_keys: BTreeMap<TenantTimelineId, usize>,
|
||||
}
|
||||
|
||||
impl DeletionStats {
|
||||
fn merge(&mut self, other: Self) {
|
||||
self.deleted_tenant_keys.extend(other.deleted_tenant_keys);
|
||||
self.deleted_timeline_keys
|
||||
.extend(other.deleted_timeline_keys);
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_batch(
|
||||
s3_client: &Client,
|
||||
s3_target: &RootTarget,
|
||||
batch: DeleteBatch,
|
||||
dry_run: bool,
|
||||
) -> anyhow::Result<DeletionStats> {
|
||||
let (deleted_tenant_keys, deleted_timeline_keys) = tokio::join!(
|
||||
delete_tenants_batch(batch.tenants, s3_target, s3_client, dry_run),
|
||||
delete_timelines_batch(batch.timelines, s3_target, s3_client, dry_run),
|
||||
);
|
||||
|
||||
Ok(DeletionStats {
|
||||
deleted_tenant_keys: deleted_tenant_keys.context("tenant batch deletion")?,
|
||||
deleted_timeline_keys: deleted_timeline_keys.context("timeline batch deletion")?,
|
||||
})
|
||||
}
|
||||
|
||||
async fn delete_tenants_batch(
|
||||
batched_tenants: Vec<TenantId>,
|
||||
s3_target: &RootTarget,
|
||||
s3_client: &Client,
|
||||
dry_run: bool,
|
||||
) -> Result<BTreeMap<TenantId, usize>, anyhow::Error> {
|
||||
info!("Deleting tenants batch of size {}", batched_tenants.len());
|
||||
info!("Tenant ids to remove: {batched_tenants:?}");
|
||||
let deleted_keys = delete_elements(
|
||||
&batched_tenants,
|
||||
s3_target,
|
||||
s3_client,
|
||||
dry_run,
|
||||
|root_target, tenant_to_delete| root_target.tenant_root(&tenant_to_delete),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if !dry_run {
|
||||
let mut last_err = None;
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match ensure_tenant_batch_deleted(s3_client, s3_target, &batched_tenants).await {
|
||||
Ok(()) => {
|
||||
last_err = None;
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to ensure the tenant batch is deleted: {e}");
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(e) = last_err {
|
||||
anyhow::bail!(
|
||||
"Failed to ensure that tenant batch is deleted {MAX_RETRIES} times: {e:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(deleted_keys)
|
||||
}
|
||||
|
||||
async fn delete_timelines_batch(
|
||||
batched_timelines: Vec<TenantTimelineId>,
|
||||
s3_target: &RootTarget,
|
||||
s3_client: &Client,
|
||||
dry_run: bool,
|
||||
) -> Result<BTreeMap<TenantTimelineId, usize>, anyhow::Error> {
|
||||
info!(
|
||||
"Deleting timelines batch of size {}",
|
||||
batched_timelines.len()
|
||||
);
|
||||
info!(
|
||||
"Timeline ids to remove: {:?}",
|
||||
batched_timelines
|
||||
.iter()
|
||||
.map(|id| id.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
let deleted_keys = delete_elements(
|
||||
&batched_timelines,
|
||||
s3_target,
|
||||
s3_client,
|
||||
dry_run,
|
||||
|root_target, timeline_to_delete| root_target.timeline_root(&timeline_to_delete),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if !dry_run {
|
||||
let mut last_err = None;
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match ensure_timeline_batch_deleted(s3_client, s3_target, &batched_timelines).await {
|
||||
Ok(()) => {
|
||||
last_err = None;
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to ensure the timelines batch is deleted: {e}");
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(e) = last_err {
|
||||
anyhow::bail!(
|
||||
"Failed to ensure that timeline batch is deleted {MAX_RETRIES} times: {e:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(deleted_keys)
|
||||
}
|
||||
|
||||
async fn delete_elements<I>(
|
||||
batched_ids: &Vec<I>,
|
||||
s3_target: &RootTarget,
|
||||
s3_client: &Client,
|
||||
dry_run: bool,
|
||||
target_producer: impl Fn(&RootTarget, I) -> S3Target,
|
||||
) -> Result<BTreeMap<I, usize>, anyhow::Error>
|
||||
where
|
||||
I: Ord + PartialOrd + Copy,
|
||||
{
|
||||
let mut deleted_keys = BTreeMap::new();
|
||||
let mut object_ids_to_delete = Vec::with_capacity(MAX_ITEMS_TO_DELETE);
|
||||
for &id_to_delete in batched_ids {
|
||||
let mut continuation_token = None;
|
||||
let mut subtargets = vec![target_producer(s3_target, id_to_delete)];
|
||||
while let Some(current_target) = subtargets.pop() {
|
||||
loop {
|
||||
let fetch_response = list_objects_with_retries(
|
||||
s3_client,
|
||||
¤t_target,
|
||||
continuation_token.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
for object_id in fetch_response
|
||||
.contents()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|object| object.key())
|
||||
.map(|key| ObjectIdentifier::builder().key(key).build())
|
||||
{
|
||||
if object_ids_to_delete.len() >= MAX_ITEMS_TO_DELETE {
|
||||
let object_ids_for_request = std::mem::replace(
|
||||
&mut object_ids_to_delete,
|
||||
Vec::with_capacity(MAX_ITEMS_TO_DELETE),
|
||||
);
|
||||
send_delete_request(
|
||||
s3_client,
|
||||
s3_target.bucket_name(),
|
||||
object_ids_for_request,
|
||||
dry_run,
|
||||
)
|
||||
.await
|
||||
.context("object ids deletion")?;
|
||||
}
|
||||
|
||||
object_ids_to_delete.push(object_id);
|
||||
*deleted_keys.entry(id_to_delete).or_default() += 1;
|
||||
}
|
||||
|
||||
subtargets.extend(
|
||||
fetch_response
|
||||
.common_prefixes()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|common_prefix| common_prefix.prefix())
|
||||
.map(|prefix| {
|
||||
let mut new_target = current_target.clone();
|
||||
new_target.prefix_in_bucket = prefix.to_string();
|
||||
new_target
|
||||
}),
|
||||
);
|
||||
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !object_ids_to_delete.is_empty() {
|
||||
info!("Removing last objects of the batch");
|
||||
send_delete_request(
|
||||
s3_client,
|
||||
s3_target.bucket_name(),
|
||||
object_ids_to_delete,
|
||||
dry_run,
|
||||
)
|
||||
.await
|
||||
.context("Last object ids deletion")?;
|
||||
}
|
||||
Ok(deleted_keys)
|
||||
}
|
||||
|
||||
pub async fn send_delete_request(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
ids: Vec<ObjectIdentifier>,
|
||||
dry_run: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("Removing {} object ids from S3", ids.len());
|
||||
info!("Object ids to remove: {ids:?}");
|
||||
let delete_request = s3_client
|
||||
.delete_objects()
|
||||
.bucket(bucket_name)
|
||||
.delete(Delete::builder().set_objects(Some(ids)).build());
|
||||
if dry_run {
|
||||
info!("Dry run, skipping the actual removal");
|
||||
Ok(())
|
||||
} else {
|
||||
let original_request = delete_request.clone();
|
||||
|
||||
for _ in 0..MAX_RETRIES {
|
||||
match delete_request
|
||||
.clone()
|
||||
.send()
|
||||
.await
|
||||
.context("delete request processing")
|
||||
{
|
||||
Ok(delete_response) => {
|
||||
info!("Delete response: {delete_response:?}");
|
||||
match delete_response.errors() {
|
||||
Some(delete_errors) => {
|
||||
error!("Delete request returned errors: {delete_errors:?}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
None => {
|
||||
info!("Successfully removed an object batch from S3");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to send a delete request: {e:#}");
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
error!("Failed to do deletion, request: {original_request:?}");
|
||||
anyhow::bail!("Failed to run deletion request {MAX_RETRIES} times");
|
||||
}
|
||||
}
|
||||
|
||||
async fn ensure_tenant_batch_deleted(
|
||||
s3_client: &Client,
|
||||
s3_target: &RootTarget,
|
||||
batch: &[TenantId],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut not_deleted_tenants = Vec::with_capacity(batch.len());
|
||||
|
||||
for &tenant_id in batch {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &s3_target.tenant_root(&tenant_id), None).await?;
|
||||
|
||||
if fetch_response.is_truncated()
|
||||
|| fetch_response.contents().is_some()
|
||||
|| fetch_response.common_prefixes().is_some()
|
||||
{
|
||||
error!(
|
||||
"Tenant {tenant_id} should be deleted, but its list response is {fetch_response:?}"
|
||||
);
|
||||
not_deleted_tenants.push(tenant_id);
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::ensure!(
|
||||
not_deleted_tenants.is_empty(),
|
||||
"Failed to delete all tenants in a batch. Tenants {not_deleted_tenants:?} should be deleted."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn ensure_timeline_batch_deleted(
|
||||
s3_client: &Client,
|
||||
s3_target: &RootTarget,
|
||||
batch: &[TenantTimelineId],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut not_deleted_timelines = Vec::with_capacity(batch.len());
|
||||
|
||||
for &id in batch {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &s3_target.timeline_root(&id), None).await?;
|
||||
|
||||
if fetch_response.is_truncated()
|
||||
|| fetch_response.contents().is_some()
|
||||
|| fetch_response.common_prefixes().is_some()
|
||||
{
|
||||
error!("Timeline {id} should be deleted, but its list response is {fetch_response:?}");
|
||||
not_deleted_timelines.push(id);
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::ensure!(
|
||||
not_deleted_timelines.is_empty(),
|
||||
"Failed to delete all timelines in a batch"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
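A standalone sketch of the batching rule described by MAX_ITEMS_TO_DELETE above: chunk keys well below the 1000-key DeleteObjects limit and send one request per chunk. The helper name is hypothetical and only the SDK calls already used in this diff appear; retries and per-object error handling are omitted.

use aws_sdk_s3::types::{Delete, ObjectIdentifier};
use aws_sdk_s3::Client;

// Hypothetical helper, not part of the diff: delete `keys` from `bucket`
// in chunks of at most 200 objects per DeleteObjects request.
async fn delete_keys_in_chunks(client: &Client, bucket: &str, keys: &[String]) -> anyhow::Result<()> {
    for chunk in keys.chunks(200) {
        let objects: Vec<ObjectIdentifier> = chunk
            .iter()
            .map(|key| ObjectIdentifier::builder().key(key.as_str()).build())
            .collect();
        client
            .delete_objects()
            .bucket(bucket)
            .delete(Delete::builder().set_objects(Some(objects)).build())
            .send()
            .await?;
    }
    Ok(())
}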
s3_scrubber/src/scan_metadata.rs (new file, 234 lines added)
@@ -0,0 +1,234 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::checks::{
|
||||
branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
|
||||
TimelineAnalysis,
|
||||
};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_logging, init_s3_client, BucketConfig, RootTarget, S3Target, CLI_NAME};
|
||||
use aws_sdk_s3::Client;
|
||||
use aws_types::region::Region;
|
||||
use futures_util::{pin_mut, StreamExt, TryStreamExt};
|
||||
use histogram::Histogram;
|
||||
use pageserver::tenant::{IndexPart, TENANTS_SEGMENT_NAME};
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub struct MetadataSummary {
|
||||
count: usize,
|
||||
with_errors: HashSet<TenantTimelineId>,
|
||||
with_warnings: HashSet<TenantTimelineId>,
|
||||
with_garbage: HashSet<TenantTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
layer_count: MinMaxHisto,
|
||||
timeline_size_bytes: MinMaxHisto,
|
||||
layer_size_bytes: MinMaxHisto,
|
||||
}
|
||||
|
||||
/// A histogram plus minimum and maximum tracking
|
||||
struct MinMaxHisto {
|
||||
histo: Histogram,
|
||||
min: u64,
|
||||
max: u64,
|
||||
}
|
||||
|
||||
impl MinMaxHisto {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
histo: histogram::Histogram::builder()
|
||||
.build()
|
||||
.expect("Bad histogram params"),
|
||||
min: u64::MAX,
|
||||
max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn sample(&mut self, v: u64) -> Result<(), histogram::Error> {
|
||||
self.min = std::cmp::min(self.min, v);
|
||||
self.max = std::cmp::max(self.max, v);
|
||||
let r = self.histo.increment(v, 1);
|
||||
|
||||
if r.is_err() {
|
||||
tracing::warn!("Bad histogram sample: {v}");
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
fn oneline(&self) -> String {
|
||||
let percentiles = match self.histo.percentiles(&[1.0, 10.0, 50.0, 90.0, 99.0]) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return format!("No data: {}", e),
|
||||
};
|
||||
|
||||
let percentiles: Vec<u64> = percentiles
|
||||
.iter()
|
||||
.map(|p| (p.bucket().low() + p.bucket().high()) / 2)
|
||||
.collect();
|
||||
|
||||
format!(
|
||||
"min {}, 1% {}, 10% {}, 50% {}, 90% {}, 99% {}, max {}",
|
||||
self.min,
|
||||
percentiles[0],
|
||||
percentiles[1],
|
||||
percentiles[2],
|
||||
percentiles[3],
|
||||
percentiles[4],
|
||||
self.max,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl MetadataSummary {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
with_errors: HashSet::new(),
|
||||
with_warnings: HashSet::new(),
|
||||
with_garbage: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
layer_count: MinMaxHisto::new(),
|
||||
timeline_size_bytes: MinMaxHisto::new(),
|
||||
layer_size_bytes: MinMaxHisto::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn update_histograms(&mut self, index_part: &IndexPart) -> Result<(), histogram::Error> {
|
||||
self.layer_count
|
||||
.sample(index_part.layer_metadata.len() as u64)?;
|
||||
let mut total_size: u64 = 0;
|
||||
for meta in index_part.layer_metadata.values() {
|
||||
total_size += meta.file_size;
|
||||
self.layer_size_bytes.sample(meta.file_size)?;
|
||||
}
|
||||
self.timeline_size_bytes.sample(total_size)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_data(&mut self, data: &S3TimelineBlobData) {
|
||||
self.count += 1;
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
s3_layers: _,
|
||||
} = &data.blob_data
|
||||
{
|
||||
*self
|
||||
.indices_by_version
|
||||
.entry(index_part.get_version())
|
||||
.or_insert(0) += 1;
|
||||
|
||||
if let Err(e) = self.update_histograms(index_part) {
|
||||
// Value out of range? Warn that the results are untrustworthy
|
||||
tracing::warn!(
|
||||
"Error updating histograms, summary stats may be wrong: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn update_analysis(&mut self, id: &TenantTimelineId, analysis: &TimelineAnalysis) {
|
||||
if !analysis.errors.is_empty() {
|
||||
self.with_errors.insert(*id);
|
||||
}
|
||||
|
||||
if !analysis.warnings.is_empty() {
|
||||
self.with_warnings.insert(*id);
|
||||
}
|
||||
}
|
||||
|
||||
/// Long-form output for printing at end of a scan
|
||||
pub fn summary_string(&self) -> String {
|
||||
let version_summary: String = itertools::join(
|
||||
self.indices_by_version
|
||||
.iter()
|
||||
.map(|(k, v)| format!("{k}: {v}")),
|
||||
", ",
|
||||
);
|
||||
|
||||
format!(
|
||||
"Timelines: {0}
|
||||
With errors: {1}
|
||||
With warnings: {2}
|
||||
With garbage: {3}
|
||||
Index versions: {version_summary}
|
||||
Timeline size bytes: {4}
|
||||
Layer size bytes: {5}
|
||||
Timeline layer count: {6}
|
||||
",
|
||||
self.count,
|
||||
self.with_errors.len(),
|
||||
self.with_warnings.len(),
|
||||
self.with_garbage.len(),
|
||||
self.timeline_size_bytes.oneline(),
|
||||
self.layer_size_bytes.oneline(),
|
||||
self.layer_count.oneline(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn is_fatal(&self) -> bool {
|
||||
!self.with_errors.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
|
||||
let file_name = format!(
|
||||
"{}_scan_metadata_{}_{}.log",
|
||||
CLI_NAME,
|
||||
bucket_config.bucket,
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
);
|
||||
|
||||
let _guard = init_logging(&file_name);
|
||||
|
||||
let s3_client = Arc::new(init_s3_client(
|
||||
bucket_config.sso_account_id,
|
||||
Region::new(bucket_config.region),
|
||||
));
|
||||
let delimiter = "/";
|
||||
let target = RootTarget::Pageserver(S3Target {
|
||||
bucket_name: bucket_config.bucket.to_string(),
|
||||
prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(delimiter),
|
||||
delimiter: delimiter.to_string(),
|
||||
});
|
||||
|
||||
let tenants = stream_tenants(&s3_client, &target);
|
||||
|
||||
// How many tenants to process in parallel. We need to be mindful of pageservers
|
||||
// accessing the same per tenant prefixes, so use a lower setting than pageservers.
|
||||
const CONCURRENCY: usize = 32;
|
||||
|
||||
// Generate a stream of TenantTimelineId
|
||||
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
// Generate a stream of S3TimelineBlobData
|
||||
async fn report_on_timeline(
|
||||
s3_client: &Client,
|
||||
target: &RootTarget,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<(TenantTimelineId, S3TimelineBlobData)> {
|
||||
let data = list_timeline_blobs(s3_client, ttid, target).await?;
|
||||
Ok((ttid, data))
|
||||
}
|
||||
let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
|
||||
let mut summary = MetadataSummary::new();
|
||||
pin_mut!(timelines);
|
||||
while let Some(i) = timelines.next().await {
|
||||
let (ttid, data) = i?;
|
||||
summary.update_data(&data);
|
||||
|
||||
let analysis =
|
||||
branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)).await;
|
||||
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
}
|
||||
|
||||
Ok(summary)
|
||||
}
|
||||
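The CONCURRENCY constant above relies on the buffered-stream pattern from futures_util. A self-contained sketch of that pattern, with a sleep standing in for the per-timeline S3 listing (values and counts are illustrative only):

use std::time::Duration;

use futures_util::{stream, StreamExt};

#[tokio::main]
async fn main() {
    // Run up to 32 of the per-item futures at once, like the scan above.
    let results: Vec<u32> = stream::iter(0u32..100)
        .map(|id| async move {
            // Stand-in for list_timeline_blobs(): some awaited I/O per item.
            tokio::time::sleep(Duration::from_millis(10)).await;
            id
        })
        .buffer_unordered(32)
        .collect()
        .await;
    // buffer_unordered yields items as they finish, so order is not preserved.
    assert_eq!(results.len(), 100);
}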
@@ -18,6 +18,10 @@
|
||||
// reportUrl: "...",
|
||||
// reportJsonUrl: "...",
|
||||
// },
|
||||
// coverage: {
|
||||
// coverageUrl: "...",
|
||||
// summaryJsonUrl: "...",
|
||||
// }
|
||||
// })
|
||||
//
|
||||
|
||||
@@ -183,7 +187,24 @@ const reportSummary = async (params) => {
|
||||
return summary
|
||||
}
|
||||
|
||||
module.exports = async ({ github, context, fetch, report }) => {
|
||||
const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => {
|
||||
let summary = `### Code coverage [full report](${coverageUrl})\n`
|
||||
|
||||
const coverage = await (await fetch(summaryJsonUrl)).json()
|
||||
for (const covType of Object.keys(coverage).sort()) {
|
||||
if (!coverage.hasOwnProperty(covType)) {
|
||||
continue
|
||||
}
|
||||
|
||||
summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n`
|
||||
}
|
||||
|
||||
summary += `\n___\n`
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
module.exports = async ({ github, context, fetch, report, coverage }) => {
|
||||
// Marker to find the comment in the subsequent runs
|
||||
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
|
||||
// If we run the script in the PR or in the branch (main/release/...)
|
||||
@@ -204,7 +225,6 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
}
|
||||
|
||||
const {reportUrl, reportJsonUrl} = report
|
||||
|
||||
if (reportUrl && reportJsonUrl) {
|
||||
try {
|
||||
const parsed = await parseReportJson({ reportJsonUrl, fetch })
|
||||
@@ -223,6 +243,22 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
} else {
|
||||
commentBody += `#### No tests were run or test report is not available\n`
|
||||
}
|
||||
|
||||
const { coverageUrl, summaryJsonUrl } = coverage
|
||||
if (coverageUrl && summaryJsonUrl) {
|
||||
try {
|
||||
commentBody += await parseCoverageSummary({ summaryJsonUrl, coverageUrl, fetch })
|
||||
} catch (error) {
|
||||
commentBody += `### [full report](${coverageUrl})\n___\n`
|
||||
commentBody += `#### Failed to create a coverage summary for the test run: \n`
|
||||
commentBody += "```\n"
|
||||
commentBody += `${error.stack}\n`
|
||||
commentBody += "```\n"
|
||||
}
|
||||
} else {
|
||||
commentBody += `#### Test coverage report is not available\n`
|
||||
}
|
||||
|
||||
commentBody += autoupdateNotice
|
||||
|
||||
let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
|
||||
|
||||
@@ -414,6 +414,8 @@ class NeonEnvBuilder:
|
||||
neon_binpath: Path,
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
test_name: str,
|
||||
test_output_dir: Path,
|
||||
remote_storage: Optional[RemoteStorage] = None,
|
||||
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
|
||||
pageserver_config_override: Optional[str] = None,
|
||||
@@ -428,7 +430,6 @@ class NeonEnvBuilder:
|
||||
preserve_database_files: bool = False,
|
||||
initial_tenant: Optional[TenantId] = None,
|
||||
initial_timeline: Optional[TimelineId] = None,
|
||||
enable_generations: bool = False,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -456,6 +457,13 @@ class NeonEnvBuilder:
|
||||
self.initial_tenant = initial_tenant or TenantId.generate()
|
||||
self.initial_timeline = initial_timeline or TimelineId.generate()
|
||||
self.enable_generations = False
|
||||
self.scrub_on_exit = False
|
||||
self.test_output_dir = test_output_dir
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
), "Unexpectedly instantiated from outside a test function"
|
||||
self.test_name = test_name
|
||||
|
||||
def init_configs(self) -> NeonEnv:
|
||||
# Cannot create more than one environment from one builder
|
||||
@@ -485,26 +493,44 @@ class NeonEnvBuilder:
|
||||
|
||||
return env
|
||||
|
||||
def enable_scrub_on_exit(self):
|
||||
"""
|
||||
Call this if you would like the fixture to automatically run
|
||||
s3_scrubber at the end of the test, as a bidirectional test
|
||||
that the scrubber is working properly, and that the code within
|
||||
the test didn't produce any invalid remote state.
|
||||
"""
|
||||
|
||||
if not isinstance(self.remote_storage, S3Storage):
|
||||
# The scrubber can't talk to e.g. LocalFS -- it needs
|
||||
# an HTTP endpoint (mock is fine) to connect to.
|
||||
raise RuntimeError(
|
||||
"Cannot scrub with remote_storage={self.remote_storage}, require an S3 endpoint"
|
||||
)
|
||||
|
||||
self.scrub_on_exit = True
|
||||
|
||||
def enable_remote_storage(
|
||||
self,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
test_name: str,
|
||||
force_enable: bool = True,
|
||||
enable_remote_extensions: bool = False,
|
||||
):
|
||||
bucket_name = re.sub(r"[_\[\]]", "-", self.test_name)[:63]
|
||||
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP:
|
||||
return
|
||||
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
|
||||
self.enable_local_fs_remote_storage(force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
self.enable_mock_s3_remote_storage(
|
||||
bucket_name=test_name,
|
||||
bucket_name=bucket_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
self.enable_real_s3_remote_storage(
|
||||
test_name=test_name,
|
||||
test_name=bucket_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
@@ -719,11 +745,20 @@ class NeonEnvBuilder:
|
||||
self.env.attachment_service.stop(immediate=True)
|
||||
|
||||
cleanup_error = None
|
||||
|
||||
if self.scrub_on_exit:
|
||||
try:
|
||||
S3Scrubber(self.test_output_dir, self).scan_metadata()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.cleanup_remote_storage()
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage cleanup: {e}")
|
||||
cleanup_error = e
|
||||
if cleanup_error is not None:
|
||||
cleanup_error = e
|
||||
|
||||
try:
|
||||
self.cleanup_local_storage()
|
||||
@@ -947,6 +982,7 @@ def _shared_simple_env(
|
||||
default_broker: NeonBroker,
|
||||
run_id: uuid.UUID,
|
||||
top_output_dir: Path,
|
||||
test_output_dir: Path,
|
||||
neon_binpath: Path,
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
@@ -974,6 +1010,8 @@ def _shared_simple_env(
|
||||
pg_version=pg_version,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1001,7 +1039,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
|
||||
@pytest.fixture(scope="function")
|
||||
def neon_env_builder(
|
||||
pytestconfig: Config,
|
||||
test_output_dir: str,
|
||||
test_output_dir: Path,
|
||||
port_distributor: PortDistributor,
|
||||
mock_s3_server: MockS3Server,
|
||||
neon_binpath: Path,
|
||||
@@ -1009,6 +1047,7 @@ def neon_env_builder(
|
||||
pg_version: PgVersion,
|
||||
default_broker: NeonBroker,
|
||||
run_id: uuid.UUID,
|
||||
request: FixtureRequest,
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1037,6 +1076,8 @@ def neon_env_builder(
|
||||
broker=default_broker,
|
||||
run_id=run_id,
|
||||
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
) as builder:
|
||||
yield builder
|
||||
|
||||
@@ -1508,8 +1549,10 @@ class ComputeCtl(AbstractNeonCli):
|
||||
class NeonAttachmentService:
|
||||
def __init__(self, env: NeonEnv):
|
||||
self.env = env
|
||||
self.running = False
|
||||
|
||||
def start(self):
|
||||
assert not self.running
|
||||
self.env.neon_cli.attachment_service_start()
|
||||
self.running = True
|
||||
return self
|
||||
@@ -1800,7 +1843,10 @@ class PgBin:
|
||||
self._fixpath(command)
|
||||
log.info(f"Running command '{' '.join(command)}'")
|
||||
env = self._build_env(env)
|
||||
return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs)
|
||||
base_path, _, _ = subprocess_capture(
|
||||
self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs
|
||||
)
|
||||
return base_path
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@@ -2806,6 +2852,41 @@ class SafekeeperHttpClient(requests.Session):
|
||||
return metrics
|
||||
|
||||
|
||||
class S3Scrubber:
|
||||
def __init__(self, log_dir: Path, env: NeonEnvBuilder):
|
||||
self.env = env
|
||||
self.log_dir = log_dir
|
||||
|
||||
def scrubber_cli(self, args, timeout):
|
||||
assert isinstance(self.env.remote_storage, S3Storage)
|
||||
s3_storage = self.env.remote_storage
|
||||
|
||||
env = {
|
||||
"REGION": s3_storage.bucket_region,
|
||||
"BUCKET": s3_storage.bucket_name,
|
||||
}
|
||||
env.update(s3_storage.access_env_vars())
|
||||
|
||||
if s3_storage.endpoint is not None:
|
||||
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
|
||||
|
||||
base_args = [self.env.neon_binpath / "s3_scrubber"]
|
||||
args = base_args + args
|
||||
|
||||
(output_path, _, status_code) = subprocess_capture(
|
||||
self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
|
||||
)
|
||||
if status_code:
|
||||
log.warning(f"Scrub command {args} failed")
|
||||
log.warning(f"Scrub environment: {env}")
|
||||
log.warning(f"Output at: {output_path}")
|
||||
|
||||
raise RuntimeError("Remote storage scrub failed")
|
||||
|
||||
def scan_metadata(self):
|
||||
self.scrubber_cli(["scan-metadata"], timeout=30)
|
||||
|
||||
|
||||
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
"""Compute the working directory for an individual test."""
|
||||
test_name = request.node.name
|
||||
|
||||
@@ -620,8 +620,3 @@ class PageserverHttpClient(requests.Session):
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def deletion_queue_flush(self, execute: bool = False):
|
||||
self.put(
|
||||
f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
|
||||
).raise_for_status()
|
||||
|
||||
@@ -88,6 +88,19 @@ def available_s3_storages() -> List[RemoteStorageKind]:
|
||||
return remote_storages
|
||||
|
||||
|
||||
def s3_storage() -> RemoteStorageKind:
|
||||
"""
|
||||
For tests that require a remote storage impl that exposes an S3
|
||||
endpoint, but don't want to parametrize over multiple storage types.
|
||||
|
||||
Use real S3 if available, else use MockS3
|
||||
"""
|
||||
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
|
||||
return RemoteStorageKind.REAL_S3
|
||||
else:
|
||||
return RemoteStorageKind.MOCK_S3
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalFsStorage:
|
||||
root: Path
|
||||
|
||||
@@ -4,9 +4,10 @@ import os
|
||||
import re
|
||||
import subprocess
|
||||
import tarfile
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, TypeVar
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import allure
|
||||
@@ -26,34 +27,100 @@ def get_self_dir() -> Path:
|
||||
return Path(__file__).resolve().parent
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str:
|
||||
"""Run a process and capture its output
|
||||
def subprocess_capture(
|
||||
capture_dir: Path,
|
||||
cmd: List[str],
|
||||
*,
|
||||
check=False,
|
||||
echo_stderr=False,
|
||||
echo_stdout=False,
|
||||
capture_stdout=False,
|
||||
**kwargs: Any,
|
||||
) -> Tuple[str, Optional[str], int]:
|
||||
"""Run a process and bifurcate its output to files and the `log` logger
|
||||
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
stderr and stdout are always captured in files. They are also optionally
|
||||
echoed to the log (echo_stderr, echo_stdout), and/or captured and returned
|
||||
(capture_stdout).
|
||||
|
||||
File output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
where "cmd" is the name of the program and NNN is an incrementing
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
Returns basepath for files with captured output.
|
||||
|
||||
Returns 3-tuple of:
|
||||
- The base path for output files
|
||||
- Captured stdout, or None
|
||||
- The exit status of the process
|
||||
"""
|
||||
assert isinstance(cmd, list)
|
||||
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
|
||||
base_cmd = os.path.basename(cmd[0])
|
||||
base = f"{base_cmd}_{global_counter()}"
|
||||
basepath = os.path.join(capture_dir, base)
|
||||
stdout_filename = f"{basepath}.stdout"
|
||||
stderr_filename = f"{basepath}.stderr"
|
||||
|
||||
# Since we will stream stdout and stderr concurrently, need to do it in a thread.
|
||||
class OutputHandler(threading.Thread):
|
||||
def __init__(self, in_file, out_file, echo: bool, capture: bool):
|
||||
super().__init__()
|
||||
self.in_file = in_file
|
||||
self.out_file = out_file
|
||||
self.echo = echo
|
||||
self.capture = capture
|
||||
self.captured = ""
|
||||
|
||||
def run(self):
|
||||
for line in self.in_file:
|
||||
# Only bother decoding if we are going to do something more than stream to a file
|
||||
if self.echo or self.capture:
|
||||
string = line.decode(encoding="utf-8", errors="replace")
|
||||
|
||||
if self.echo:
|
||||
log.info(string)
|
||||
|
||||
if self.capture:
|
||||
self.captured += string
|
||||
|
||||
self.out_file.write(line)
|
||||
|
||||
captured = None
|
||||
try:
|
||||
with open(stdout_filename, "w") as stdout_f:
|
||||
with open(stderr_filename, "w") as stderr_f:
|
||||
with open(stdout_filename, "wb") as stdout_f:
|
||||
with open(stderr_filename, "wb") as stderr_f:
|
||||
log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"')
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
p = subprocess.Popen(
|
||||
cmd,
|
||||
**kwargs,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
stdout_handler = OutputHandler(
|
||||
p.stdout, stdout_f, echo=echo_stdout, capture=capture_stdout
|
||||
)
|
||||
stdout_handler.start()
|
||||
stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
|
||||
stderr_handler.start()
|
||||
|
||||
r = p.wait()
|
||||
|
||||
stdout_handler.join()
|
||||
stderr_handler.join()
|
||||
|
||||
if check and r != 0:
|
||||
raise subprocess.CalledProcessError(r, " ".join(cmd))
|
||||
|
||||
if capture_stdout:
|
||||
captured = stdout_handler.captured
|
||||
finally:
|
||||
# Remove empty files if there is no output
|
||||
for filename in (stdout_filename, stderr_filename):
|
||||
if os.stat(filename).st_size == 0:
|
||||
os.remove(filename)
|
||||
|
||||
return basepath
|
||||
return (basepath, captured, r)
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Tuple
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import pytest
|
||||
from _pytest.mark import ParameterSet
|
||||
@@ -78,6 +79,15 @@ QUERIES: Tuple[LabelledQuery, ...] = (
|
||||
)
|
||||
|
||||
|
||||
def get_scale() -> List[str]:
|
||||
# We parametrize each tpc-h and clickbench test with scale
|
||||
# to distinguish them from each other, but don't really use it inside.
|
||||
# Databases are pre-created and passed through BENCHMARK_CONNSTR env variable.
|
||||
|
||||
scale = os.getenv("TEST_OLAP_SCALE", "noscale")
|
||||
return [scale]
|
||||
|
||||
|
||||
def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
|
||||
# prepare connstr:
|
||||
# - cut out password from connstr to pass it via env
|
||||
@@ -100,9 +110,10 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N
|
||||
env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", get_scale())
|
||||
@pytest.mark.parametrize("query", QUERIES)
|
||||
@pytest.mark.remote_cluster
|
||||
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale: str):
|
||||
"""
|
||||
An OLAP-style ClickHouse benchmark
|
||||
|
||||
@@ -128,9 +139,10 @@ def tpch_queuies() -> Tuple[ParameterSet, ...]:
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", get_scale())
|
||||
@pytest.mark.parametrize("query", tpch_queuies())
|
||||
@pytest.mark.remote_cluster
|
||||
def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare, scale: str):
|
||||
"""
|
||||
TPC-H Benchmark
|
||||
|
||||
|
||||
@@ -16,7 +16,6 @@ from fixtures.utils import wait_until
|
||||
def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_attach_tenant_config",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -39,7 +38,6 @@ class NegativeTests:
|
||||
def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]:
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_attach_tenant_config",
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
|
||||
@@ -135,7 +135,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
|
||||
|
||||
log.info(f"setting up eviction_env for test {request.node.name}")
|
||||
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
|
||||
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
# initial tenant will not be present on this pageserver
|
||||
env = neon_env_builder.init_configs()
|
||||
|
||||
@@ -90,7 +90,6 @@ def test_remote_extensions(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_extensions",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -157,7 +156,6 @@ def test_remote_library(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_remote_library",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -218,7 +216,6 @@ def test_multiple_extensions_one_archive(
|
||||
):
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.REAL_S3,
|
||||
test_name="test_multiple_extensions_one_archive",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -266,7 +263,6 @@ def test_extension_download_after_restart(
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
test_name="test_extension_download_after_restart",
|
||||
enable_remote_extensions=True,
|
||||
)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -102,7 +102,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_gc_index_upload",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()