Introduce a maximum size for values and warn for them

2026-01-30 16:50:37 +00:00 · 2024-02-06 01:06:27 +01:00
195 changed files with 3486 additions and 6299 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -179,12 +179,6 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

-    - name: Cache poetry deps
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
-
    - name: Store Allure test stat in the DB (new)
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
      shell: bash -euxo pipefail {0}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,10 +44,6 @@ inputs:
    description: 'Postgres version to use for tests'
    required: false
    default: 'v14'
-  benchmark_durations:
-    description: 'benchmark durations JSON'
-    required: false
-    default: '{}'

 runs:
  using: "composite"
@@ -86,10 +82,11 @@ runs:
        fetch-depth: 1

    - name: Cache poetry deps
+      id: cache_poetry
      uses: actions/cache@v3
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}

    - name: Install Python deps
      shell: bash -euxo pipefail {0}
@@ -163,7 +160,7 @@ runs:
        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
          mkdir -p $TEST_OUTPUT
-          echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
+          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"

          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -93,7 +93,6 @@ jobs:
                                                       --body-file "body.md" \
                                                       --head "${BRANCH}" \
                                                       --base "main" \
-                                                       --label "run-e2e-tests-in-draft" \
                                                       --draft
          fi

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -22,7 +22,7 @@ env:
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
-  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

 jobs:
  check-permissions:
@@ -112,10 +112,11 @@ jobs:
          fetch-depth: 1

      - name: Cache poetry deps
+        id: cache_poetry
        uses: actions/cache@v3
        with:
          path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync
@@ -131,7 +132,7 @@ jobs:

  check-codestyle-rust:
    needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      options: --init
@@ -477,40 +478,8 @@ jobs:
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

-  get-benchmarks-durations:
-    outputs:
-      json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Cache poetry deps
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
-
-      - name: Install Python deps
-        run: ./scripts/pysync
-
-      - name: get benchmark durations
-        id: get-benchmark-durations
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-        run: |
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
-                                                      --days 10 \
-                                                      --output /tmp/benchmark_durations.json
-          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
-
  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-buildtools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
@@ -521,7 +490,7 @@ jobs:
      fail-fast: false
      matrix:
        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
-        pytest_split_group: [ 1, 2, 3, 4, 5 ]
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
@@ -534,8 +503,7 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
-          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -692,10 +660,50 @@ jobs:
            })

  trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
    needs: [ check-permissions, promote-images, tag ]
-    uses: ./.github/workflows/trigger-e2e-tests.yml
-    secrets: inherit
+    runs-on: [ self-hosted, gen3, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
+    steps:
+      - name: Set PR's status to pending and request a remote CI test
+        run: |
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
+          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
+          # to place a job run status update later.
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\",
+                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+              }
+            }"

  neon-image:
    needs: [ check-permissions, build-buildtools-image, tag ]
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -38,10 +38,11 @@ jobs:
      uses: snok/install-poetry@v1

    - name: Cache poetry deps
+      id: cache_poetry
      uses: actions/cache@v3
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}

    - name: Install Python deps
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -1,118 +0,0 @@
-name: Trigger E2E Tests
-
-on:
-  pull_request:
-    types:
-      - ready_for_review
-  workflow_call:
-
-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-    
-env:
-  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
-  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-jobs:
-  cancel-previous-e2e-tests:
-    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Cancel previous e2e-tests runs for this PR
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          gh workflow --repo neondatabase/cloud \
-            run cancel-previous-in-concurrency-group.yml \
-              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
-
-  tag:
-    runs-on: [ ubuntu-latest ]
-    outputs:
-      build-tag: ${{ steps.build-tag.outputs.tag }}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
-      - name: Get build tag
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-          CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }}
-          CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
-            echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
-          fi
-        id: build-tag
-
-  trigger-e2e-tests:
-    needs: [ tag ]
-    runs-on: [ self-hosted, gen3, small ]
-    env:
-      TAG: ${{ needs.tag.outputs.build-tag }}
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
-    steps:
-      - name: check if ecr image are present
-        run: |
-          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
-            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
-            if [ "$OUTPUT" == "" ]; then
-              echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
-              exit 1
-            fi
-          done
-
-      - name: Set PR's status to pending and request a remote CI test
-        run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
-          # to place a job run status update later.
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-
-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
-
-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"pending\",
-              \"context\": \"neon-cloud-e2e\",
-              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-            }"
-
-          curl -f -X POST \
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"ref\": \"main\",
-              \"inputs\": {
-                \"ci_job_name\": \"neon-cloud-e2e\",
-                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\",
-                \"storage_image_tag\": \"${TAG}\",
-                \"compute_image_tag\": \"${TAG}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
-              }
-            }"
- 
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -54,9 +54,6 @@ _An instruction for maintainers_
 - If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
    - Press the "Approve and run" button in GitHub UI
    - Add the `approved-for-ci-run` label to the PR
-    - Currently draft PR will skip e2e test (only for internal contributors). After turning the PR 'Ready to Review' CI will trigger e2e test
-      - Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour)
-      - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors

 Repeat all steps after any change to the PR.
 - When the changes are ready to get merged — merge the original PR (not the internal one)
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -281,7 +281,6 @@ dependencies = [
 "clap",
 "control_plane",
 "diesel",
- "diesel_migrations",
 "futures",
 "git-version",
 "hyper",
@@ -289,7 +288,6 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
- "r2d2",
 "reqwest",
 "serde",
 "serde_json",
@@ -1329,6 +1327,8 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "diesel",
+ "diesel_migrations",
 "futures",
 "git-version",
 "hex",
@@ -1650,7 +1650,6 @@ dependencies = [
 "diesel_derives",
 "itoa",
 "pq-sys",
- "r2d2",
 "serde_json",
 ]

@@ -2719,16 +2718,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "lasso"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
-dependencies = [
- "dashmap",
- "hashbrown 0.13.2",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -2867,7 +2856,6 @@ dependencies = [
 "chrono",
 "libc",
 "once_cell",
- "procfs",
 "prometheus",
 "rand 0.8.5",
 "rand_distr",
@@ -3985,8 +3973,6 @@ checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
 dependencies = [
 "bitflags 1.3.2",
 "byteorder",
- "chrono",
- "flate2",
 "hex",
 "lazy_static",
 "rustix 0.36.16",
@@ -4077,7 +4063,6 @@ dependencies = [
 "clap",
 "consumption_metrics",
 "dashmap",
- "env_logger",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -4090,7 +4075,6 @@ dependencies = [
 "hyper-tungstenite",
 "ipnet",
 "itertools",
- "lasso",
 "md5",
 "metrics",
 "native-tls",
@@ -4107,7 +4091,6 @@ dependencies = [
 "pq_proto",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
 "rcgen",
 "redis",
 "regex",
@@ -4125,7 +4108,6 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
- "smallvec",
 "smol_str",
 "socket2 0.5.5",
 "sync_wrapper",
@@ -4144,7 +4126,6 @@ dependencies = [
 "tracing-subscriber",
 "tracing-utils",
 "url",
- "urlencoding",
 "utils",
 "uuid",
 "walkdir",
@@ -4172,17 +4153,6 @@ dependencies = [
 "proc-macro2",
 ]

-[[package]]
-name = "r2d2"
-version = "0.8.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
-dependencies = [
- "log",
- "parking_lot 0.12.1",
- "scheduled-thread-pool",
-]
-
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -4896,15 +4866,6 @@ dependencies = [
 "windows-sys 0.42.0",
 ]

-[[package]]
-name = "scheduled-thread-pool"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
-dependencies = [
- "parking_lot 0.12.1",
-]
-
 [[package]]
 name = "scopeguard"
 version = "1.1.0"
@@ -5740,7 +5701,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6265,7 +6226,7 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#d6a1c93442fb6b3a5bec490204961134e54925dc"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
 dependencies = [
 "io-uring",
 "libc",
@@ -6832,6 +6793,7 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "diesel",
 "either",
 "fail",
 "futures-channel",
@@ -6841,7 +6803,6 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
- "hashbrown 0.13.2",
 "hashbrown 0.14.0",
 "hex",
 "hmac",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -95,7 +95,6 @@ inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
-lasso = "0.7"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
@@ -113,7 +112,6 @@ parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
@@ -171,7 +169,6 @@ tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
-urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
--- a/5
+++ b/5
@@ -100,11 +100,6 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
       -c "listen_pg_addr='0.0.0.0:6400'" \
       -c "listen_http_addr='0.0.0.0:9898'"

-# When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
-# that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH /usr/local/v16/lib
-
-
 VOLUME ["/data"]
 USER neon
 EXPOSE 6400
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -111,7 +111,7 @@ USER nonroot:nonroot
 WORKDIR /home/nonroot

 # Python
-ENV PYTHON_VERSION=3.9.18 \
+ENV PYTHON_VERSION=3.9.2 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.76.0
+ENV RUSTC_VERSION=1.75.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
+    echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -639,8 +642,8 @@ FROM build-deps AS pg-anon-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
-    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
+RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
+    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -809,7 +812,6 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
-COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 Neon
-Copyright 2022 - 2024 Neon Inc.
+Copyright 2022 Neon Inc.

 The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
 See vendor/postgres-vX/COPYRIGHT for details.
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -207,7 +207,6 @@ fn maybe_cgexec(cmd: &str) -> Command {

 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
-#[instrument(skip_all)]
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    let roles = spec
        .cluster
@@ -765,12 +764,7 @@ impl ComputeNode {
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
-        handle_grants(
-            spec,
-            &mut client,
-            connstr.as_str(),
-            self.has_feature(ComputeFeature::AnonExtension),
-        )?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;
@@ -778,11 +772,12 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

-        // Run migrations separately to not hold up cold starts
-        thread::spawn(move || {
-            let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client)
-        });
+        if self.has_feature(ComputeFeature::Migrations) {
+            thread::spawn(move || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_migrations(&mut client)
+            });
+        }
        Ok(())
    }

@@ -844,12 +839,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(
-                &spec,
-                &mut client,
-                self.connstr.as_str(),
-                self.has_feature(ComputeFeature::AnonExtension),
-            )?;
+            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
            handle_extension_neon(&mut client)?;
            // We can skip handle_migrations here because a new migration can only appear
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -138,34 +138,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
                    }
                }
                //
-                // Don't suspend compute if there is an active logical replication subscription
-                //
-                // `where pid is not null` – to filter out read only computes and subscription on branches
-                //
-                let logical_subscriptions_query =
-                    "select count(*) from pg_stat_subscription where pid is not null;";
-                match cli.query_one(logical_subscriptions_query, &[]) {
-                    Ok(row) => match row.try_get::<&str, i64>("count") {
-                        Ok(num_subscribers) => {
-                            if num_subscribers > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!(
-                            "failed to get list of active logical replication subscriptions: {:?}",
-                            e
-                        );
-                        continue;
-                    }
-                }
-                //
                // Do not suspend compute if autovacuum is running
                //
                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -264,10 +264,9 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    // case we miss some events for some reason. Not strictly necessary, but
    // better safe than sorry.
    let (tx, rx) = std::sync::mpsc::channel();
-    let watcher_res = notify::recommended_watcher(move |res| {
+    let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
        let _ = tx.send(res);
-    });
-    let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
+    }) {
        Ok(watcher) => (Box::new(watcher), rx),
        Err(e) => {
            match e.kind {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -581,12 +581,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(
-    spec: &ComputeSpec,
-    client: &mut Client,
-    connstr: &str,
-    enable_anon_extension: bool,
-) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
    info!("modifying database permissions");
    let existing_dbs = get_existing_dbs(client)?;

@@ -683,11 +678,6 @@ pub fn handle_grants(
            inlinify(&grant_query)
        );
        db_client.simple_query(&grant_query)?;
-
-        // it is important to run this after all grants
-        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
-        }
    }

    Ok(())
@@ -776,7 +766,6 @@ BEGIN
    END IF;
 END
 $$;"#,
-        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -820,125 +809,5 @@ $$;"#,
        "Ran {} migrations",
        (migrations.len() - starting_migration_id)
    );
-
-    Ok(())
-}
-
-/// Connect to the database as superuser and pre-create anon extension
-/// if it is present in shared_preload_libraries
-#[instrument(skip_all)]
-pub fn handle_extension_anon(
-    spec: &ComputeSpec,
-    db_owner: &str,
-    db_client: &mut Client,
-    grants_only: bool,
-) -> Result<()> {
-    info!("handle extension anon");
-
-    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-        if libs.contains("anon") {
-            if !grants_only {
-                // check if extension is already initialized using anon.is_initialized()
-                let query = "SELECT anon.is_initialized()";
-                match db_client.query(query, &[]) {
-                    Ok(rows) => {
-                        if !rows.is_empty() {
-                            let is_initialized: bool = rows[0].get(0);
-                            if is_initialized {
-                                info!("anon extension is already initialized");
-                                return Ok(());
-                            }
-                        }
-                    }
-                    Err(e) => {
-                        warn!(
-                            "anon extension is_installed check failed with expected error: {}",
-                            e
-                        );
-                    }
-                };
-
-                // Create anon extension if this compute needs it
-                // Users cannot create it themselves, because superuser is required.
-                let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
-                info!("creating anon extension with query: {}", query);
-                match db_client.query(query, &[]) {
-                    Ok(_) => {}
-                    Err(e) => {
-                        error!("anon extension creation failed with error: {}", e);
-                        return Ok(());
-                    }
-                }
-
-                // check that extension is installed
-                query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-                let rows = db_client.query(query, &[])?;
-                if rows.is_empty() {
-                    error!("anon extension is not installed");
-                    return Ok(());
-                }
-
-                // Initialize anon extension
-                // This also requires superuser privileges, so users cannot do it themselves.
-                query = "SELECT anon.init()";
-                match db_client.query(query, &[]) {
-                    Ok(_) => {}
-                    Err(e) => {
-                        error!("anon.init() failed with error: {}", e);
-                        return Ok(());
-                    }
-                }
-            }
-
-            // check that extension is installed, if not bail early
-            let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
-            match db_client.query(query, &[]) {
-                Ok(rows) => {
-                    if rows.is_empty() {
-                        error!("anon extension is not installed");
-                        return Ok(());
-                    }
-                }
-                Err(e) => {
-                    error!("anon extension check failed with error: {}", e);
-                    return Ok(());
-                }
-            };
-
-            let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            // Grant permissions to db_owner to use anon extension functions
-            let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            // This is needed, because some functions are defined as SECURITY DEFINER.
-            // In Postgres SECURITY DEFINER functions are executed with the privileges
-            // of the owner.
-            // In anon extension this it is needed to access some GUCs, which are only accessible to
-            // superuser. But we've patched postgres to allow db_owner to access them as well.
-            // So we need to change owner of these functions to db_owner.
-            let query = format!("
-                SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
-                from pg_proc p
-                join pg_namespace nsp ON p.pronamespace = nsp.oid
-                where nsp.nspname = 'anon';", db_owner);
-
-            info!("change anon extension functions owner to db owner");
-            db_client.simple_query(&query)?;
-
-            //  affects views as well
-            let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-
-            let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
-            info!("granting anon extension permissions with query: {}", query);
-            db_client.simple_query(&query)?;
-        }
-    }
-
    Ok(())
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -24,9 +24,7 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true

-diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
-diesel_migrations = { version = "2.1.0" }
-r2d2 = { version = "0.8.10" }
+diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }

 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -7,7 +7,6 @@ CREATE TABLE tenant_shards (
  generation INTEGER NOT NULL,
  generation_pageserver BIGINT NOT NULL,
  placement_policy VARCHAR NOT NULL,
-  splitting SMALLINT NOT NULL,
  -- config is JSON encoded, opaque to the database.
  config TEXT NOT NULL
 );
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -170,7 +170,7 @@ impl ComputeHook {
        reconfigure_request: &ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        let req = client.request(Method::PUT, url);
+        let req = client.request(Method::POST, url);
        let req = if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
@@ -240,15 +240,13 @@ impl ComputeHook {
        let client = reqwest::Client::new();
        backoff::retry(
            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
+            |e| matches!(e, NotifyError::Fatal(_)),
            3,
            10,
            "Send compute notification",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown),
        )
        .await
-        .ok_or_else(|| NotifyError::ShuttingDown)
-        .and_then(|x| x)
    }

    /// Call this to notify the compute (postgres) tier of new pageservers to use
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -3,8 +3,7 @@ use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
-    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TimelineCreateRequest,
+    TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -42,7 +41,7 @@ pub struct HttpState {

 impl HttpState {
    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
-        let allowlist_routes = ["/status", "/ready", "/metrics"]
+        let allowlist_routes = ["/status"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
@@ -280,12 +279,6 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    json_response(StatusCode::OK, state.service.node_list().await?)
 }

-async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
-}
-
 async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
    let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -299,19 +292,6 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }

-async fn handle_tenant_shard_split(
-    service: Arc<Service>,
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
-
-    json_response(
-        StatusCode::OK,
-        service.tenant_shard_split(tenant_id, split_req).await?,
-    )
-}
-
 async fn handle_tenant_shard_migrate(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -326,29 +306,11 @@ async fn handle_tenant_shard_migrate(
    )
 }

-async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
-}
-
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
 }

-/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
-/// with remote pageserver nodes).  This is intended for use as a kubernetes readiness probe.
-async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&req);
-    if state.service.startup_complete.is_ready() {
-        json_response(StatusCode::OK, ())
-    } else {
-        json_response(StatusCode::SERVICE_UNAVAILABLE, ())
-    }
-}
-
 impl From<ReconcileError> for ApiError {
    fn from(value: ReconcileError) -> Self {
        ApiError::Conflict(format!("Reconciliation error: {}", value))
@@ -404,7 +366,6 @@ pub fn make_router(
        .data(Arc::new(HttpState::new(service, auth)))
        // Non-prefixed generic endpoints (status, metrics)
        .get("/status", |r| request_span(r, handle_status))
-        .get("/ready", |r| request_span(r, handle_ready))
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
            request_span(r, handle_re_attach)
@@ -415,12 +376,6 @@ pub fn make_router(
            request_span(r, handle_attach_hook)
        })
        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
-        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            request_span(r, handle_tenant_drop)
-        })
-        .post("/debug/v1/node/:node_id/drop", |r| {
-            request_span(r, handle_node_drop)
-        })
        .get("/control/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(r, handle_tenant_locate)
        })
@@ -436,9 +391,6 @@ pub fn make_router(
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
            tenant_service_handler(r, handle_tenant_shard_migrate)
        })
-        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(r, handle_tenant_shard_split)
-        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
@@ -451,6 +403,10 @@ pub fn make_router(
        .put("/v1/tenant/:tenant_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
+        // Tenant Shard operations (low level/maintenance)
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(r, handle_tenant_timeline_delete)
@@ -459,7 +415,7 @@ pub fn make_router(
            tenant_service_handler(r, handle_tenant_timeline_create)
        })
        // Tenant detail GET passthrough to shard zero
-        .get("/v1/tenant/:tenant_id", |r| {
+        .get("/v1/tenant/:tenant_id*", |r| {
            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
@@ -467,4 +423,8 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
            tenant_service_handler(r, handle_tenant_timeline_passthrough)
        })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -4,14 +4,13 @@
 /// This enables running & testing pageservers without a full-blown
 /// deployment of the Neon cloud platform.
 ///
-use anyhow::{anyhow, Context};
+use anyhow::anyhow;
 use attachment_service::http::make_router;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
-use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
@@ -23,9 +22,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -34,9 +30,9 @@ struct Cli {
    #[arg(short, long)]
    listen: std::net::SocketAddr,

-    /// Public key for JWT authentication of clients
+    /// Path to public key for JWT authentication of clients
    #[arg(long)]
-    public_key: Option<String>,
+    public_key: Option<camino::Utf8PathBuf>,

    /// Token for authenticating this service with the pageservers it controls
    #[arg(long)]
@@ -57,7 +53,7 @@ struct Cli {

    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
-    database_url: Option<String>,
+    database_url: String,
 }

 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -78,9 +74,10 @@ impl Secrets {
    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";

    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        match &args.database_url {
-            Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+        if args.database_url.is_empty() {
+            Self::load_aws_sm().await
+        } else {
+            Self::load_cli(args)
        }
    }

@@ -156,13 +153,13 @@ impl Secrets {
        })
    }

-    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
+    fn load_cli(args: &Cli) -> anyhow::Result<Self> {
        let public_key = match &args.public_key {
            None => None,
-            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
+            Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
        };
        Ok(Self {
-            database_url: database_url.to_owned(),
+            database_url: args.database_url.clone(),
            public_key,
            jwt_token: args.jwt_token.clone(),
            control_plane_jwt_token: args.control_plane_jwt_token.clone(),
@@ -170,32 +167,8 @@ impl Secrets {
    }
 }

-/// Execute the diesel migrations that are built into this binary
-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
-fn main() -> anyhow::Result<()> {
-    tokio::runtime::Builder::new_current_thread()
-        // We use spawn_blocking for database operations, so require approximately
-        // as many blocking threads as we will open database connections.
-        .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize)
-        .enable_all()
-        .build()
-        .unwrap()
-        .block_on(async_main())
-}
-
-async fn async_main() -> anyhow::Result<()> {
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
    let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));

    logging::init(
@@ -222,11 +195,6 @@ async fn async_main() -> anyhow::Result<()> {
        compute_hook_url: args.compute_hook_url,
    };

-    // After loading secrets & config, but before starting anything else, apply database migrations
-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
    let json_path = args.path;
    let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,9 +1,6 @@
-pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::str::FromStr;
-use std::time::Duration;

-use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
@@ -47,7 +44,7 @@ use crate::PlacementPolicy;
 /// updated, and reads of nodes are always from memory, not the database.  We only require that
 /// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    connection_pool: diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<PgConnection>>,
+    database_url: String,

    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
@@ -67,8 +64,6 @@ pub(crate) enum DatabaseError {
    Query(#[from] diesel::result::Error),
    #[error(transparent)]
    Connection(#[from] diesel::result::ConnectionError),
-    #[error(transparent)]
-    ConnectionPool(#[from] r2d2::Error),
    #[error("Logical error: {0}")]
    Logical(String),
 }
@@ -76,31 +71,9 @@ pub(crate) enum DatabaseError {
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

 impl Persistence {
-    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
-    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
-    pub const MAX_CONNECTIONS: u32 = 99;
-
-    // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used.
-    const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10);
-    const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60);
-
    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
-        let manager = diesel::r2d2::ConnectionManager::<PgConnection>::new(database_url);
-
-        // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time
-        // to execute queries (database queries are not generally on latency-sensitive paths).
-        let connection_pool = diesel::r2d2::Pool::builder()
-            .max_size(Self::MAX_CONNECTIONS)
-            .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME))
-            .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT))
-            // Always keep at least one connection ready to go
-            .min_idle(Some(1))
-            .test_on_check_out(true)
-            .build(manager)
-            .expect("Could not build connection pool");
-
        Self {
-            connection_pool,
+            database_url,
            json_path,
        }
    }
@@ -111,10 +84,14 @@ impl Persistence {
        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
-        let mut conn = self.connection_pool.get()?;
-        tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
-            .await
-            .expect("Task panic")
+        let database_url = self.database_url.clone();
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            // TODO: connection pooling, such as via diesel::r2d2
+            let mut conn = PgConnection::establish(&database_url)?;
+            func(&mut conn)
+        })
+        .await
+        .expect("Task panic")
    }

    /// When a node is first registered, persist it before using it for anything
@@ -260,6 +237,7 @@ impl Persistence {

    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
    /// the tenant from memory on this server.
+    #[allow(unused)]
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_conn(move |conn| -> DatabaseResult<()> {
@@ -272,18 +250,6 @@ impl Persistence {
        .await
    }

-    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
-        use crate::schema::nodes::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(nodes)
-                .filter(node_id.eq(del_node_id.0 as i64))
-                .execute(conn)?;
-
-            Ok(())
-        })
-        .await
-    }
-
    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
    /// the node that called /re-attach.
@@ -376,107 +342,19 @@ impl Persistence {
        Ok(())
    }

-    // When we start shard splitting, we must durably mark the tenant so that
-    // on restart, we know that we must go through recovery.
-    //
-    // We create the child shards here, so that they will be available for increment_generation calls
-    // if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
+    // TODO: when we start shard splitting, we must durably mark the tenant so that
+    // on restart, we know that we must go through recovery (list shards that exist
+    // and pick up where we left off and/or revert to parent shards).
    #[allow(dead_code)]
-    pub(crate) async fn begin_shard_split(
-        &self,
-        old_shard_count: ShardCount,
-        split_tenant_id: TenantId,
-        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> DatabaseResult<()> {
-                // Mark parent shards as splitting
-
-                let expect_parent_records = std::cmp::max(1, old_shard_count.0);
-
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
-                    .set((splitting.eq(1),))
-                    .execute(conn)?;
-                if u8::try_from(updated)
-                    .map_err(|_| DatabaseError::Logical(
-                        format!("Overflow existing shard count {} while splitting", updated))
-                    )? != expect_parent_records {
-                    // Perhaps a deletion or another split raced with this attempt to split, mutating
-                    // the parent shards that we intend to split. In this case the split request should fail.
-                    return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
-                    ));
-                }
-
-                // FIXME: spurious clone to sidestep closure move rules
-                let parent_to_children = parent_to_children.clone();
-
-                // Insert child shards
-                for (parent_shard_id, children) in parent_to_children {
-                    let mut parent = crate::schema::tenant_shards::table
-                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
-                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
-                        .load::<TenantShardPersistence>(conn)?;
-                    let parent = if parent.len() != 1 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Parent shard {parent_shard_id} not found"
-                        )));
-                    } else {
-                        parent.pop().unwrap()
-                    };
-                    for mut shard in children {
-                        // Carry the parent's generation into the child
-                        shard.generation = parent.generation;
-
-                        debug_assert!(shard.splitting == SplitState::Splitting);
-                        diesel::insert_into(tenant_shards)
-                            .values(shard)
-                            .execute(conn)?;
-                    }
-                }
-
-                Ok(())
-            })?;
-
-            Ok(())
-        })
-        .await
+    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
    }

-    // When we finish shard splitting, we must atomically clean up the old shards
+    // TODO: when we finish shard splitting, we must atomically clean up the old shards
    // and insert the new shards, and clear the splitting marker.
    #[allow(dead_code)]
-    pub(crate) async fn complete_shard_split(
-        &self,
-        split_tenant_id: TenantId,
-        old_shard_count: ShardCount,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                // Drop parent shards
-                diesel::delete(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
-                    .execute(conn)?;
-
-                // Clear sharding flag
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .set((splitting.eq(0),))
-                    .execute(conn)?;
-                debug_assert!(updated > 0);
-
-                Ok(())
-            })?;
-
-            Ok(())
-        })
-        .await
+    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
    }
 }

@@ -504,8 +382,6 @@ pub(crate) struct TenantShardPersistence {
    #[serde(default)]
    pub(crate) placement_policy: String,
    #[serde(default)]
-    pub(crate) splitting: SplitState,
-    #[serde(default)]
    pub(crate) config: String,
 }

--- a/control_plane/attachment_service/src/persistence/split_state.rs
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
@@ -1,46 +0,0 @@
-use diesel::pg::{Pg, PgValue};
-use diesel::{
-    deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
-    sql_types::Int2,
-};
-use serde::{Deserialize, Serialize};
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
-#[diesel(sql_type = SplitStateSQLRepr)]
-#[derive(Deserialize, Serialize)]
-pub enum SplitState {
-    Idle = 0,
-    Splitting = 1,
-}
-
-impl Default for SplitState {
-    fn default() -> Self {
-        Self::Idle
-    }
-}
-
-type SplitStateSQLRepr = Int2;
-
-impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
-    fn to_sql<'a>(
-        &'a self,
-        out: &'a mut diesel::serialize::Output<Pg>,
-    ) -> diesel::serialize::Result {
-        let raw_value: i16 = *self as i16;
-        let mut new_out = out.reborrow();
-        ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
-    }
-}
-
-impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
-    fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
-        match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
-            0 => Some(Self::Idle),
-            1 => Some(Self::Splitting),
-            _ => None,
-        })? {
-            Some(v) => Ok(v),
-            None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()),
-        }
-    }
-}
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -20,7 +20,6 @@ diesel::table! {
        generation -> Int4,
        generation_pageserver -> Int8,
        placement_policy -> Varchar,
-        splitting -> Int2,
        config -> Text,
    }
 }
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,6 +1,5 @@
 use std::{
-    cmp::Ordering,
-    collections::{BTreeMap, HashMap, HashSet},
+    collections::{BTreeMap, HashMap},
    str::FromStr,
    sync::Arc,
    time::{Duration, Instant},
@@ -24,14 +23,13 @@ use pageserver_api::{
    models::{
        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
 use utils::{
-    backoff,
    completion::Barrier,
    generation::Generation,
    http::error::ApiError,
@@ -42,11 +40,7 @@ use utils::{
 use crate::{
    compute_hook::{self, ComputeHook},
    node::Node,
-    persistence::{
-        split_state::SplitState, DatabaseError, NodePersistence, Persistence,
-        TenantShardPersistence,
-    },
-    reconciler::attached_location_conf,
+    persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
    scheduler::Scheduler,
    tenant_state::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -109,9 +103,7 @@ impl From<DatabaseError> for ApiError {
        match err {
            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
-            DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => {
-                ApiError::ShuttingDown
-            }
+            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
            DatabaseError::Logical(reason) => {
                ApiError::InternalServerError(anyhow::anyhow!(reason))
            }
@@ -151,71 +143,31 @@ impl Service {
        // indeterminate, same as in [`ObservedStateLocation`])
        let mut observed = HashMap::new();

-        let mut nodes_online = HashSet::new();
-
-        // TODO: give Service a cancellation token for clean shutdown
-        let cancel = CancellationToken::new();
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };

        // TODO: issue these requests concurrently
-        {
-            let nodes = {
-                let locked = self.inner.read().unwrap();
-                locked.nodes.clone()
-            };
-            for node in nodes.values() {
-                let http_client = reqwest::ClientBuilder::new()
-                    .timeout(Duration::from_secs(5))
-                    .build()
-                    .expect("Failed to construct HTTP client");
-                let client = mgmt_api::Client::from_client(
-                    http_client,
-                    node.base_url(),
-                    self.config.jwt_token.as_deref(),
-                );
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());

-                fn is_fatal(e: &mgmt_api::Error) -> bool {
-                    use mgmt_api::Error::*;
-                    match e {
-                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
-                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
-                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
-                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
-                        ApiError(_, _) => true,
-                    }
+            tracing::info!("Scanning shards on node {}...", node.id);
+            match client.list_location_config().await {
+                Err(e) => {
+                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                }
+                Ok(listing) => {
+                    tracing::info!(
+                        "Received {} shard statuses from pageserver {}, setting it to Active",
+                        listing.tenant_shards.len(),
+                        node.id
+                    );

-                let list_response = backoff::retry(
-                    || client.list_location_config(),
-                    is_fatal,
-                    1,
-                    5,
-                    "Location config listing",
-                    &cancel,
-                )
-                .await;
-                let Some(list_response) = list_response else {
-                    tracing::info!("Shutdown during startup_reconcile");
-                    return;
-                };
-
-                tracing::info!("Scanning shards on node {}...", node.id);
-                match list_response {
-                    Err(e) => {
-                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                        // TODO: be more tolerant, do some retries, in case
-                        // pageserver is being restarted at the same time as we are
-                    }
-                    Ok(listing) => {
-                        tracing::info!(
-                            "Received {} shard statuses from pageserver {}, setting it to Active",
-                            listing.tenant_shards.len(),
-                            node.id
-                        );
-                        nodes_online.insert(node.id);
-
-                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                            observed.insert(tenant_shard_id, (node.id, conf_opt));
-                        }
+                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
+                        observed.insert(tenant_shard_id, (node.id, conf_opt));
                    }
                }
            }
@@ -226,19 +178,8 @@ impl Service {
        let mut compute_notifications = Vec::new();

        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let (shard_count, nodes) = {
+        let shard_count = {
            let mut locked = self.inner.write().unwrap();
-
-            // Mark nodes online if they responded to us: nodes are offline by default after a restart.
-            let mut nodes = (*locked.nodes).clone();
-            for (node_id, node) in nodes.iter_mut() {
-                if nodes_online.contains(node_id) {
-                    node.availability = NodeAvailability::Active;
-                }
-            }
-            locked.nodes = Arc::new(nodes);
-            let nodes = locked.nodes.clone();
-
            for (tenant_shard_id, (node_id, observed_loc)) in observed {
                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
                    cleanup.push((tenant_shard_id, node_id));
@@ -270,7 +211,7 @@ impl Service {
                }
            }

-            (locked.tenants.len(), nodes)
+            locked.tenants.len()
        };

        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
@@ -331,8 +272,9 @@ impl Service {
        let stream = futures::stream::iter(compute_notifications.into_iter())
            .map(|(tenant_shard_id, node_id)| {
                let compute_hook = compute_hook.clone();
-                let cancel = cancel.clone();
                async move {
+                    // TODO: give Service a cancellation token for clean shutdown
+                    let cancel = CancellationToken::new();
                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                        tracing::error!(
                            tenant_shard_id=%tenant_shard_id,
@@ -438,7 +380,7 @@ impl Service {
            ))),
            config,
            persistence,
-            startup_complete: startup_complete.clone(),
+            startup_complete,
        });

        let result_task_this = this.clone();
@@ -532,7 +474,6 @@ impl Service {
                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
-                splitting: SplitState::default(),
            };

            match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -775,7 +716,6 @@ impl Service {
                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
-                splitting: SplitState::default(),
            })
            .collect();
        self.persistence
@@ -1035,10 +975,6 @@ impl Service {
            }
        };

-        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
-        // because otherwise a broken compute hook causes a feedback loop where
-        // location_config returns 500 and gets retried forever.
-
        if let Some(create_req) = maybe_create {
            let create_resp = self.tenant_create(create_req).await?;
            result.shards = create_resp
@@ -1051,15 +987,7 @@ impl Service {
                .collect();
        } else {
            // This was an update, wait for reconciliation
-            if let Err(e) = self.await_waiters(waiters).await {
-                // Do not treat a reconcile error as fatal: we have already applied any requested
-                // Intent changes, and the reconcile can fail for external reasons like unavailable
-                // compute notification API.  In these cases, it is important that we do not
-                // cause the cloud control plane to retry forever on this API.
-                tracing::warn!(
-                    "Failed to reconcile after /location_config: {e}, returning success anyway"
-                );
-            }
+            self.await_waiters(waiters).await?;
        }

        Ok(result)
@@ -1162,7 +1090,6 @@ impl Service {
        self.ensure_attached_wait(tenant_id).await?;

        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -1243,7 +1170,6 @@ impl Service {
        self.ensure_attached_wait(tenant_id).await?;

        // TODO: refuse to do this if shard splitting is in progress
-        // (https://github.com/neondatabase/neon/issues/6676)
        let targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();
@@ -1416,326 +1342,6 @@ impl Service {
        })
    }

-    pub(crate) async fn tenant_shard_split(
-        &self,
-        tenant_id: TenantId,
-        split_req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse, ApiError> {
-        let mut policy = None;
-        let mut shard_ident = None;
-
-        // TODO: put a cancellation token on Service for clean shutdown
-        let cancel = CancellationToken::new();
-
-        // A parent shard which will be split
-        struct SplitTarget {
-            parent_id: TenantShardId,
-            node: Node,
-            child_ids: Vec<TenantShardId>,
-        }
-
-        // Validate input, and calculate which shards we will create
-        let (old_shard_count, targets, compute_hook) = {
-            let locked = self.inner.read().unwrap();
-
-            let pageservers = locked.nodes.clone();
-
-            let mut targets = Vec::new();
-
-            // In case this is a retry, count how many already-split shards we found
-            let mut children_found = Vec::new();
-            let mut old_shard_count = None;
-
-            for (tenant_shard_id, shard) in
-                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-            {
-                match shard.shard.count.0.cmp(&split_req.new_shard_count) {
-                    Ordering::Equal => {
-                        //  Already split this
-                        children_found.push(*tenant_shard_id);
-                        continue;
-                    }
-                    Ordering::Greater => {
-                        return Err(ApiError::BadRequest(anyhow::anyhow!(
-                            "Requested count {} but already have shards at count {}",
-                            split_req.new_shard_count,
-                            shard.shard.count.0
-                        )));
-                    }
-                    Ordering::Less => {
-                        // Fall through: this shard has lower count than requested,
-                        // is a candidate for splitting.
-                    }
-                }
-
-                match old_shard_count {
-                    None => old_shard_count = Some(shard.shard.count),
-                    Some(old_shard_count) => {
-                        if old_shard_count != shard.shard.count {
-                            // We may hit this case if a caller asked for two splits to
-                            // different sizes, before the first one is complete.
-                            // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
-                            // of shard_count=1 and shard_count=2 shards in the map.
-                            return Err(ApiError::Conflict(
-                                "Cannot split, currently mid-split".to_string(),
-                            ));
-                        }
-                    }
-                }
-                if policy.is_none() {
-                    policy = Some(shard.policy.clone());
-                }
-                if shard_ident.is_none() {
-                    shard_ident = Some(shard.shard);
-                }
-
-                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
-                    tracing::info!(
-                        "Tenant shard {} already has shard count {}",
-                        tenant_shard_id,
-                        split_req.new_shard_count
-                    );
-                    continue;
-                }
-
-                let node_id =
-                    shard
-                        .intent
-                        .attached
-                        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
-                            "Cannot split a tenant that is not attached"
-                        )))?;
-
-                let node = pageservers
-                    .get(&node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                // TODO: if any reconciliation is currently in progress for this shard, wait for it.
-
-                targets.push(SplitTarget {
-                    parent_id: *tenant_shard_id,
-                    node: node.clone(),
-                    child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
-                });
-            }
-
-            if targets.is_empty() {
-                if children_found.len() == split_req.new_shard_count as usize {
-                    return Ok(TenantShardSplitResponse {
-                        new_shards: children_found,
-                    });
-                } else {
-                    // No shards found to split, and no existing children found: the
-                    // tenant doesn't exist at all.
-                    return Err(ApiError::NotFound(
-                        anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
-                    ));
-                }
-            }
-
-            (old_shard_count, targets, locked.compute_hook.clone())
-        };
-
-        // unwrap safety: we would have returned above if we didn't find at least one shard to split
-        let old_shard_count = old_shard_count.unwrap();
-        let shard_ident = shard_ident.unwrap();
-        let policy = policy.unwrap();
-
-        // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
-        // request could occur here, deleting or mutating the tenant.  begin_shard_split checks that the
-        // parent shards exist as expected, but it would be neater to do the above pre-checks within the
-        // same database transaction rather than pre-check in-memory and then maybe-fail the database write.
-        // (https://github.com/neondatabase/neon/issues/6676)
-
-        // Before creating any new child shards in memory or on the pageservers, persist them: this
-        // enables us to ensure that we will always be able to clean up if something goes wrong.  This also
-        // acts as the protection against two concurrent attempts to split: one of them will get a database
-        // error trying to insert the child shards.
-        let mut child_tsps = Vec::new();
-        for target in &targets {
-            let mut this_child_tsps = Vec::new();
-            for child in &target.child_ids {
-                let mut child_shard = shard_ident;
-                child_shard.number = child.shard_number;
-                child_shard.count = child.shard_count;
-
-                this_child_tsps.push(TenantShardPersistence {
-                    tenant_id: child.tenant_id.to_string(),
-                    shard_number: child.shard_number.0 as i32,
-                    shard_count: child.shard_count.0 as i32,
-                    shard_stripe_size: shard_ident.stripe_size.0 as i32,
-                    // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
-                    // populate the correct generation as part of its transaction, to protect us
-                    // against racing with changes in the state of the parent.
-                    generation: 0,
-                    generation_pageserver: target.node.id.0 as i64,
-                    placement_policy: serde_json::to_string(&policy).unwrap(),
-                    // TODO: get the config out of the map
-                    config: serde_json::to_string(&TenantConfig::default()).unwrap(),
-                    splitting: SplitState::Splitting,
-                });
-            }
-
-            child_tsps.push((target.parent_id, this_child_tsps));
-        }
-
-        if let Err(e) = self
-            .persistence
-            .begin_shard_split(old_shard_count, tenant_id, child_tsps)
-            .await
-        {
-            match e {
-                DatabaseError::Query(diesel::result::Error::DatabaseError(
-                    DatabaseErrorKind::UniqueViolation,
-                    _,
-                )) => {
-                    // Inserting a child shard violated a unique constraint: we raced with another call to
-                    // this function
-                    tracing::warn!("Conflicting attempt to split {tenant_id}: {e}");
-                    return Err(ApiError::Conflict("Tenant is already splitting".into()));
-                }
-                _ => return Err(ApiError::InternalServerError(e.into())),
-            }
-        }
-
-        // FIXME: we have now committed the shard split state to the database, so any subsequent
-        // failure needs to roll it back.  We will later wrap this function in logic to roll back
-        // the split if it fails.
-        // (https://github.com/neondatabase/neon/issues/6676)
-
-        // TODO: issue split calls concurrently (this only matters once we're splitting
-        // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
-
-        for target in &targets {
-            let SplitTarget {
-                parent_id,
-                node,
-                child_ids,
-            } = target;
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
-            let response = client
-                .tenant_shard_split(
-                    *parent_id,
-                    TenantShardSplitRequest {
-                        new_shard_count: split_req.new_shard_count,
-                    },
-                )
-                .await
-                .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
-
-            tracing::info!(
-                "Split {} into {}",
-                parent_id,
-                response
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-
-            if &response.new_shards != child_ids {
-                // This should never happen: the pageserver should agree with us on how shard splits work.
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})",
-                    parent_id,
-                    response.new_shards,
-                    child_ids
-                )));
-            }
-        }
-
-        // TODO: if the pageserver restarted concurrently with our split API call,
-        // the actual generation of the child shard might differ from the generation
-        // we expect it to have.  In order for our in-database generation to end up
-        // correct, we should carry the child generation back in the response and apply it here
-        // in complete_shard_split (and apply the correct generation in memory)
-        // (or, we can carry generation in the request and reject the request if
-        //  it doesn't match, but that requires more retry logic on this side)
-
-        self.persistence
-            .complete_shard_split(tenant_id, old_shard_count)
-            .await?;
-
-        // Replace all the shards we just split with their children
-        let mut response = TenantShardSplitResponse {
-            new_shards: Vec::new(),
-        };
-        let mut child_locations = Vec::new();
-        {
-            let mut locked = self.inner.write().unwrap();
-            for target in targets {
-                let SplitTarget {
-                    parent_id,
-                    node: _node,
-                    child_ids,
-                } = target;
-                let (pageserver, generation, config) = {
-                    let old_state = locked
-                        .tenants
-                        .remove(&parent_id)
-                        .expect("It was present, we just split it");
-                    (
-                        old_state.intent.attached.unwrap(),
-                        old_state.generation,
-                        old_state.config.clone(),
-                    )
-                };
-
-                locked.tenants.remove(&parent_id);
-
-                for child in child_ids {
-                    let mut child_shard = shard_ident;
-                    child_shard.number = child.shard_number;
-                    child_shard.count = child.shard_count;
-
-                    let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
-                    child_observed.insert(
-                        pageserver,
-                        ObservedStateLocation {
-                            conf: Some(attached_location_conf(generation, &child_shard, &config)),
-                        },
-                    );
-
-                    let mut child_state = TenantState::new(child, child_shard, policy.clone());
-                    child_state.intent = IntentState::single(Some(pageserver));
-                    child_state.observed = ObservedState {
-                        locations: child_observed,
-                    };
-                    child_state.generation = generation;
-                    child_state.config = config.clone();
-
-                    child_locations.push((child, pageserver));
-
-                    locked.tenants.insert(child, child_state);
-                    response.new_shards.push(child);
-                }
-            }
-        }
-
-        // Send compute notifications for all the new shards
-        let mut failed_notifications = Vec::new();
-        for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
-                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
-                        child_id, child_ps);
-                failed_notifications.push(child_id);
-            }
-        }
-
-        // If we failed any compute notifications, make a note to retry later.
-        if !failed_notifications.is_empty() {
-            let mut locked = self.inner.write().unwrap();
-            for failed in failed_notifications {
-                if let Some(shard) = locked.tenants.get_mut(&failed) {
-                    shard.pending_compute_notification = true;
-                }
-            }
-        }
-
-        Ok(response)
-    }
-
    pub(crate) async fn tenant_shard_migrate(
        &self,
        tenant_shard_id: TenantShardId,
@@ -1804,45 +1410,6 @@ impl Service {
        Ok(TenantShardMigrateResponse {})
    }

-    /// This is for debug/support only: we simply drop all state for a tenant, without
-    /// detaching or deleting it on pageservers.
-    pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
-        self.persistence.delete_tenant(tenant_id).await?;
-
-        let mut locked = self.inner.write().unwrap();
-        let mut shards = Vec::new();
-        for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
-            shards.push(*tenant_shard_id);
-        }
-
-        for shard in shards {
-            locked.tenants.remove(&shard);
-        }
-
-        Ok(())
-    }
-
-    /// This is for debug/support only: we simply drop all state for a tenant, without
-    /// detaching or deleting it on pageservers.  We do not try and re-schedule any
-    /// tenants that were on this node.
-    ///
-    /// TODO: proper node deletion API that unhooks things more gracefully
-    pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> {
-        self.persistence.delete_node(node_id).await?;
-
-        let mut locked = self.inner.write().unwrap();
-
-        for shard in locked.tenants.values_mut() {
-            shard.deref_node(node_id);
-        }
-
-        let mut nodes = (*locked.nodes).clone();
-        nodes.remove(&node_id);
-        locked.nodes = Arc::new(nodes);
-
-        Ok(())
-    }
-
    pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
        // It is convenient to avoid taking the big lock and converting Node to a serializable
        // structure, by fetching from storage instead of reading in-memory state.
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -193,13 +193,6 @@ impl IntentState {
        result
    }

-    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
-        Self {
-            attached: node_id,
-            secondary: vec![],
-        }
-    }
-
    /// When a node goes offline, we update intents to avoid using it
    /// as their attached pageserver.
    ///
@@ -293,9 +286,6 @@ impl TenantState {
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.

-        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
-        // change their attach location.
-
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut used_pageservers = self.intent.all_pageservers();
@@ -534,18 +524,4 @@ impl TenantState {
            seq: self.sequence,
        })
    }
-
-    // If we had any state at all referring to this node ID, drop it.  Does not
-    // attempt to reschedule.
-    pub(crate) fn deref_node(&mut self, node_id: NodeId) {
-        if self.intent.attached == Some(node_id) {
-            self.intent.attached = None;
-        }
-
-        self.intent.secondary.retain(|n| n != &node_id);
-
-        self.observed.locations.remove(&node_id);
-
-        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
-    }
 }
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,17 +1,20 @@
 use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
-    models::{
-        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
-    },
+    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::str::FromStr;
+use std::{env, str::FromStr};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -25,7 +28,7 @@ pub struct AttachmentService {
    listen: String,
    path: Utf8PathBuf,
    jwt_token: Option<String>,
-    public_key: Option<String>,
+    public_key_path: Option<Utf8PathBuf>,
    postgres_port: u16,
    client: reqwest::Client,
 }
@@ -204,7 +207,7 @@ impl AttachmentService {
            .pageservers
            .first()
            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
                let jwt_token = env
@@ -216,26 +219,7 @@ impl AttachmentService {
                let public_key_path =
                    camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
                        .unwrap();
-
-                // This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
-                let public_key = if std::fs::metadata(&public_key_path)
-                    .expect("Can't stat public key")
-                    .is_dir()
-                {
-                    // Our config may specify a directory: this is for the pageserver's ability to handle multiple
-                    // keys.  We only use one key at a time, so, arbitrarily load the first one in the directory.
-                    let mut dir =
-                        std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
-                    let dent = dir
-                        .next()
-                        .expect("Empty key dir")
-                        .expect("Error reading key dir");
-
-                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
-                } else {
-                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
-                };
-                (Some(jwt_token), Some(public_key))
+                (Some(jwt_token), Some(public_key_path))
            }
        };

@@ -244,7 +228,7 @@ impl AttachmentService {
            path,
            listen,
            jwt_token,
-            public_key,
+            public_key_path,
            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
@@ -267,6 +251,37 @@ impl AttachmentService {
        .expect("non-Unicode path")
    }

+    /// In order to access database migrations, we need to find the Neon source tree
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either the current working directory or our binary is in the source tree. The
+        // former is usually true for automated test runners, the latter is usually true for developer
+        // workstations. Often both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // For each candidate start point, search through ancestors looking for a neon.git source tree root
+        for start_point in &candidate_start_points {
+            // Walk upwards from the start point: assumes it lies somewhere inside a neon source tree
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // Fall-through
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
    ///
    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
@@ -306,32 +321,69 @@ impl AttachmentService {
    ///
    /// Returns the database url
    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        const DB_NAME: &str = "attachment_service";
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }

-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let createdb_path = pg_bin_dir.join("createdb");
-        let output = Command::new(&createdb_path)
-            .args([
-                "-h",
-                "localhost",
-                "-p",
-                &format!("{}", self.postgres_port),
-                &DB_NAME,
-            ])
-            .output()
-            .await
-            .expect("Failed to spawn createdb");
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }

-        if !output.status.success() {
-            let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
-            if stderr.contains("already exists") {
-                tracing::info!("Database {DB_NAME} already exists");
-            } else {
-                anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
            }
        }

+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
        Ok(database_url)
    }

@@ -401,8 +453,8 @@ impl AttachmentService {
            args.push(format!("--jwt-token={jwt_token}"));
        }

-        if let Some(public_key) = &self.public_key {
-            args.push(format!("--public-key=\"{public_key}\""));
+        if let Some(public_key_path) = &self.public_key_path {
+            args.push(format!("--public-key={public_key_path}"));
        }

        if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
@@ -577,7 +629,7 @@ impl AttachmentService {
    ) -> anyhow::Result<TenantShardMigrateResponse> {
        self.dispatch(
            Method::PUT,
-            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
+            format!("tenant/{tenant_shard_id}/migrate"),
            Some(TenantShardMigrateRequest {
                tenant_shard_id,
                node_id,
@@ -586,20 +638,6 @@ impl AttachmentService {
        .await
    }

-    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
-    pub async fn tenant_split(
-        &self,
-        tenant_id: TenantId,
-        new_shard_count: u8,
-    ) -> anyhow::Result<TenantShardSplitResponse> {
-        self.dispatch(
-            Method::PUT,
-            format!("control/v1/tenant/{tenant_id}/shard_split"),
-            Some(TenantShardSplitRequest { new_shard_count }),
-        )
-        .await
-    }
-
    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -72,6 +72,7 @@ where
    let log_path = datadir.join(format!("{process_name}.log"));
    let process_log_file = fs::OpenOptions::new()
        .create(true)
+        .write(true)
        .append(true)
        .open(&log_path)
        .with_context(|| {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -575,26 +575,6 @@ async fn handle_tenant(
            println!("{tenant_table}");
            println!("{shard_table}");
        }
-        Some(("shard-split", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
-            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
-
-            let attachment_service = AttachmentService::from_env(env);
-            let result = attachment_service
-                .tenant_split(tenant_id, shard_count)
-                .await?;
-            println!(
-                "Split tenant {} into shards {}",
-                tenant_id,
-                result
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
-
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -1014,13 +994,12 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
            let destroy = sub_args.get_flag("destroy");
-            let mode = sub_args.get_one::<String>("mode").expect("has a default");

            let endpoint = cplane
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(mode, destroy)?;
+            endpoint.stop(destroy)?;
        }

        _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
@@ -1304,7 +1283,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
+                if let Err(e) = node.stop(false) {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
@@ -1545,11 +1524,6 @@ fn cli() -> Command {
            .subcommand(Command::new("status")
                .about("Human readable summary of the tenant's shards and attachment locations")
                .arg(tenant_id_arg.clone()))
-            .subcommand(Command::new("shard-split")
-                .about("Increase the number of shards in the tenant")
-                .arg(tenant_id_arg.clone())
-                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
-                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1653,16 +1627,7 @@ fn cli() -> Command {
                            .long("destroy")
                            .action(ArgAction::SetTrue)
                            .required(false)
-                    )
-                    .arg(
-                        Arg::new("mode")
-                            .help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
-                            .long("mode")
-                            .action(ArgAction::Set)
-                            .required(false)
-                            .value_parser(["smart", "fast", "immediate"])
-                            .default_value("fast")
-                    )
+                        )
                )

        )
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -761,8 +761,22 @@ impl Endpoint {
        }
    }

-    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
-        self.pg_ctl(&["-m", mode, "stop"], &None)?;
+    pub fn stop(&self, destroy: bool) -> Result<()> {
+        // If we are going to destroy data directory,
+        // use immediate shutdown mode, otherwise,
+        // shutdown gracefully to leave the data directory sane.
+        //
+        // Postgres is always started from scratch, so stop
+        // without destroy is only used for testing and debugging.
+        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;

        // Also wait for the compute_ctl process to die. It might have some
        // cleanup work to do after postgres stops, like syncing safekeepers,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,8 +90,8 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

-    /// Pre-install and initialize anon extension for every database in the cluster
-    AnonExtension,
+    /// Enable running migrations
+    Migrations,

    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -13,9 +13,6 @@ twox-hash.workspace = true

 workspace_hack.workspace = true

-[target.'cfg(target_os = "linux")'.dependencies]
-procfs.workspace = true
-
 [dev-dependencies]
 rand = "0.8"
 rand_distr = "0.4.3"
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -31,8 +31,6 @@ pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
 pub mod metric_vec_duration;
 pub use hll::{HyperLogLog, HyperLogLogVec};
-#[cfg(target_os = "linux")]
-pub mod more_process_metrics;

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/more_process_metrics.rs
+++ b/libs/metrics/src/more_process_metrics.rs
@@ -1,54 +0,0 @@
-//! process metrics that the [`::prometheus`] crate doesn't provide.
-
-// This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
-
-use crate::UIntGauge;
-
-pub struct Collector {
-    descs: Vec<prometheus::core::Desc>,
-    vmlck: crate::UIntGauge,
-}
-
-const NMETRICS: usize = 1;
-
-impl prometheus::core::Collector for Collector {
-    fn desc(&self) -> Vec<&prometheus::core::Desc> {
-        self.descs.iter().collect()
-    }
-
-    fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
-        let Ok(myself) = procfs::process::Process::myself() else {
-            return vec![];
-        };
-        let mut mfs = Vec::with_capacity(NMETRICS);
-        if let Ok(status) = myself.status() {
-            if let Some(vmlck) = status.vmlck {
-                self.vmlck.set(vmlck);
-                mfs.extend(self.vmlck.collect())
-            }
-        }
-        mfs
-    }
-}
-
-impl Collector {
-    pub fn new() -> Self {
-        let mut descs = Vec::new();
-
-        let vmlck =
-            UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap();
-        descs.extend(
-            prometheus::core::Collector::desc(&vmlck)
-                .into_iter()
-                .cloned(),
-        );
-
-        Self { descs, vmlck }
-    }
-}
-
-impl Default for Collector {
-    fn default() -> Self {
-        Self::new()
-    }
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -192,16 +192,6 @@ pub struct TimelineCreateRequest {
    pub pg_version: Option<u32>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct TenantShardSplitRequest {
-    pub new_shard_count: u8,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct TenantShardSplitResponse {
-    pub new_shards: Vec<TenantShardId>,
-}
-
 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
@@ -659,27 +649,6 @@ pub struct WalRedoManagerStatus {
    pub pid: Option<u32>,
 }

-pub mod virtual_file {
-    #[derive(
-        Copy,
-        Clone,
-        PartialEq,
-        Eq,
-        Hash,
-        strum_macros::EnumString,
-        strum_macros::Display,
-        serde_with::DeserializeFromStr,
-        serde_with::SerializeDisplay,
-        Debug,
-    )]
-    #[strum(serialize_all = "kebab-case")]
-    pub enum IoEngineKind {
-        StdFs,
-        #[cfg(target_os = "linux")]
-        TokioEpollUring,
-    }
-}
-
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -88,36 +88,12 @@ impl TenantShardId {
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
            shard_count: self.shard_count,
        }
    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
 }

 /// Formatting helper
@@ -817,108 +793,4 @@ mod tests {
        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
        assert_eq!(shard, ShardNumber(8));
    }
-
-    #[test]
-    fn shard_id_split() {
-        let tenant_id = TenantId::generate();
-        let parent = TenantShardId::unsharded(tenant_id);
-
-        // Unsharded into 2
-        assert_eq!(
-            parent.split(ShardCount(2)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(1)
-                }
-            ]
-        );
-
-        // Unsharded into 4
-        assert_eq!(
-            parent.split(ShardCount(4)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(1)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(2)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(4),
-                    shard_number: ShardNumber(3)
-                }
-            ]
-        );
-
-        // count=1 into 2 (check this works the same as unsharded.)
-        let parent = TenantShardId {
-            tenant_id,
-            shard_count: ShardCount(1),
-            shard_number: ShardNumber(0),
-        };
-        assert_eq!(
-            parent.split(ShardCount(2)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(0)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(2),
-                    shard_number: ShardNumber(1)
-                }
-            ]
-        );
-
-        // count=2 into count=8
-        let parent = TenantShardId {
-            tenant_id,
-            shard_count: ShardCount(2),
-            shard_number: ShardNumber(1),
-        };
-        assert_eq!(
-            parent.split(ShardCount(8)),
-            vec![
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(1)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(3)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(5)
-                },
-                TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount(8),
-                    shard_number: ShardNumber(7)
-                },
-            ]
-        );
-    }
 }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -191,7 +191,6 @@ impl RemoteStorage for AzureBlobStorage {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> anyhow::Result<Listing, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -224,8 +223,6 @@ impl RemoteStorage for AzureBlobStorage {

        let mut response = builder.into_stream();
        let mut res = Listing::default();
-        // NonZeroU32 doesn't support subtraction apparently
-        let mut max_keys = max_keys.map(|mk| mk.get());
        while let Some(l) = response.next().await {
            let entry = l.map_err(to_download_error)?;
            let prefix_iter = entry
@@ -238,18 +235,7 @@ impl RemoteStorage for AzureBlobStorage {
                .blobs
                .blobs()
                .map(|k| self.name_to_relative_path(&k.name));
-
-            for key in blob_iter {
-                res.keys.push(key);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(res); // limit reached
-                    }
-                    max_keys = Some(mk);
-                }
-            }
+            res.keys.extend(blob_iter);
        }
        Ok(res)
    }
@@ -393,7 +379,7 @@ impl RemoteStorage for AzureBlobStorage {
        _prefix: Option<&RemotePath>,
        _timestamp: SystemTime,
        _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        // TODO use Azure point in time recovery feature for this
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -13,15 +13,9 @@ mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
-mod support;

 use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-    time::SystemTime,
+    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
 };

 use anyhow::{bail, Context};
@@ -160,7 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None)
+            .list(prefix, ListingMode::WithDelimiter)
            .await?
            .prefixes;
        Ok(result)
@@ -176,17 +170,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys)
-            .await?
-            .keys;
+    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
        Ok(result)
    }

@@ -194,8 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        &self,
        prefix: Option<&RemotePath>,
        _mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Listing, DownloadError>;
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -234,7 +218,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError>;
 }

@@ -285,19 +269,6 @@ impl std::fmt::Display for DownloadError {

 impl std::error::Error for DownloadError {}

-impl DownloadError {
-    /// Returns true if the error should not be retried with backoff
-    pub fn is_permanent(&self) -> bool {
-        use DownloadError::*;
-        match self {
-            BadInput(_) => true,
-            NotFound => true,
-            Cancelled => true,
-            Other(_) => false,
-        }
-    }
-}
-
 #[derive(Debug)]
 pub enum TimeTravelError {
    /// Validation or other error happened due to user input.
@@ -353,31 +324,24 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> anyhow::Result<Listing, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
-            Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
-            Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
-            Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
+            Self::LocalFs(s) => s.list(prefix, mode).await,
+            Self::AwsS3(s) => s.list(prefix, mode).await,
+            Self::AzureBlob(s) => s.list(prefix, mode).await,
+            Self::Unreliable(s) => s.list(prefix, mode).await,
        }
    }

    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys).await,
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::AzureBlob(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }

@@ -478,7 +442,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        match self {
            Self::LocalFs(s) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,9 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{
-    borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
-};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};

 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -20,7 +18,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
+use crate::{
+    Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
+};

 use super::{RemoteStorage, StorageMetadata};

@@ -164,7 +164,6 @@ impl RemoteStorage for LocalFs {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
        let mut result = Listing::default();

@@ -181,9 +180,6 @@ impl RemoteStorage for LocalFs {
                    !path.is_dir()
                })
                .collect();
-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }

            return Ok(result);
        }
@@ -369,33 +365,27 @@ impl RemoteStorage for LocalFs {
                    format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?;
-
-            let len = source
-                .metadata()
-                .await
-                .context("query file length")
-                .map_err(DownloadError::Other)?
-                .len();
-
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
                .context("Failed to seek to the range start in a local storage file")
                .map_err(DownloadError::Other)?;
-
            let metadata = self
                .read_storage_metadata(&target_path)
                .await
                .map_err(DownloadError::Other)?;

-            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
-            let source = ReaderStream::new(source);
-
+            let download_stream: DownloadStream = match end_exclusive {
+                Some(end_exclusive) => Box::pin(ReaderStream::new(
+                    source.take(end_exclusive - start_inclusive),
+                )),
+                None => Box::pin(ReaderStream::new(source)),
+            };
            Ok(Download {
                metadata,
                last_modified: None,
                etag: None,
-                download_stream: Box::pin(source),
+                download_stream,
            })
        } else {
            Err(DownloadError::NotFound)
@@ -441,7 +431,7 @@ impl RemoteStorage for LocalFs {
        _prefix: Option<&RemotePath>,
        _timestamp: SystemTime,
        _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
@@ -524,8 +514,10 @@ mod fs_tests {
    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

-    async fn read_and_check_metadata(
+    async fn read_and_assert_remote_file_contents(
        storage: &LocalFs,
+        #[allow(clippy::ptr_arg)]
+        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
@@ -604,7 +596,7 @@ mod fs_tests {
        let upload_name = "upload_1";
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

-        let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
+        let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
            contents,
@@ -626,7 +618,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        let full_range_download_contents =
-            read_and_check_metadata(&storage, &upload_target, None).await?;
+            read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
            full_range_download_contents,
@@ -668,22 +660,6 @@ mod fs_tests {
            "Second part bytes should be returned when requested"
        );

-        let suffix_bytes = storage
-            .download_byte_range(&upload_target, 13, None)
-            .await?
-            .download_stream;
-        let suffix_bytes = aggregate(suffix_bytes).await?;
-        let suffix = std::str::from_utf8(&suffix_bytes)?;
-        assert_eq!(upload_name, suffix);
-
-        let all_bytes = storage
-            .download_byte_range(&upload_target, 0, None)
-            .await?
-            .download_stream;
-        let all_bytes = aggregate(all_bytes).await?;
-        let all_bytes = std::str::from_utf8(&all_bytes)?;
-        assert_eq!(dummy_contents("upload_1"), all_bytes);
-
        Ok(())
    }

@@ -760,7 +736,7 @@ mod fs_tests {
            upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;

        let full_range_download_contents =
-            read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
+            read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
        assert_eq!(
            dummy_contents(upload_name),
            full_range_download_contents,
@@ -796,12 +772,12 @@ mod fs_tests {
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;

-        let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
        assert!(listing.prefixes.is_empty());
        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

        // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;

        assert_eq!(
            listing.prefixes,
@@ -814,7 +790,6 @@ mod fs_tests {
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                ListingMode::WithDelimiter,
-                None,
            )
            .await?;
        assert_eq!(
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -7,7 +7,6 @@
 use std::{
    borrow::Cow,
    collections::HashMap,
-    num::NonZeroU32,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
@@ -46,9 +45,8 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
-    support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
-    RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -65,6 +63,7 @@ pub struct S3Bucket {
    concurrency_limiter: ConcurrencyLimiter,
 }

+#[derive(Default)]
 struct GetObjectRequest {
    bucket: String,
    key: String,
@@ -233,8 +232,24 @@ impl S3Bucket {

        let started_at = ScopeGuard::into_inner(started_at);

-        let object_output = match get_object {
-            Ok(object_output) => object_output,
+        match get_object {
+            Ok(object_output) => {
+                let metadata = object_output.metadata().cloned().map(StorageMetadata);
+                let etag = object_output.e_tag.clone();
+                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
+                Ok(Download {
+                    metadata,
+                    etag,
+                    last_modified,
+                    download_stream: Box::pin(body),
+                })
+            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
@@ -244,7 +259,7 @@ impl S3Bucket {
                    AttemptOutcome::Ok,
                    started_at,
                );
-                return Err(DownloadError::NotFound);
+                Err(DownloadError::NotFound)
            }
            Err(e) => {
                metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
@@ -253,27 +268,11 @@ impl S3Bucket {
                    started_at,
                );

-                return Err(DownloadError::Other(
+                Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
-                ));
+                ))
            }
-        };
-
-        let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output.e_tag;
-        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
-
-        let body = object_output.body;
-        let body = ByteStreamAsStream::from(body);
-        let body = PermitCarrying::new(permit, body);
-        let body = TimedDownload::new(started_at, body);
-
-        Ok(Download {
-            metadata,
-            etag,
-            last_modified,
-            download_stream: Box::pin(body),
-        })
+        }
    }

    async fn delete_oids(
@@ -355,6 +354,33 @@ impl Stream for ByteStreamAsStream {
    // sense and Stream::size_hint does not really
 }

+pin_project_lite::pin_project! {
+    /// A `Stream` adapter which carries a semaphore permit for the lifetime of the value.
+    struct PermitCarrying<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S> PermitCarrying<S> {
+    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        Self { permit, inner }
+    }
+}
+
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
 pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    struct TimedDownload<S> {
@@ -409,11 +435,8 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
-        // s3 sdk wants i32
-        let mut max_keys = max_keys.map(|mk| mk.get() as i32);
        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
@@ -437,20 +460,13 @@ impl RemoteStorage for S3Bucket {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            // min of two Options, returning Some if one is value and another is
-            // None (None is smaller than anything, so plain min doesn't work).
-            let request_max_keys = self
-                .max_keys_per_list_response
-                .into_iter()
-                .chain(max_keys.into_iter())
-                .min();
            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .set_max_keys(request_max_keys);
+                .set_max_keys(self.max_keys_per_list_response);

            if let ListingMode::WithDelimiter = mode {
                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
@@ -480,14 +496,6 @@ impl RemoteStorage for S3Bucket {
                let object_path = object.key().expect("response does not contain a key");
                let remote_path = self.s3_object_to_relative_path(object_path);
                result.keys.push(remote_path);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(result); // limit reached
-                    }
-                    max_keys = Some(mk);
-                }
            }

            result.prefixes.extend(
@@ -630,7 +638,7 @@ impl RemoteStorage for S3Bucket {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
        let _guard = self.permit(kind).await;
@@ -670,11 +678,9 @@ impl RemoteStorage for S3Bucket {
                warn_threshold,
                max_retries,
                "listing object versions for time_travel_recover",
-                cancel,
+                backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
            )
-            .await
-            .ok_or_else(|| TimeTravelError::Cancelled)
-            .and_then(|x| x)?;
+            .await?;

            tracing::trace!(
                "  Got List response version_id_marker={:?}, key_marker={:?}",
@@ -799,11 +805,9 @@ impl RemoteStorage for S3Bucket {
                            warn_threshold,
                            max_retries,
                            "copying object version for time_travel_recover",
-                            cancel,
+                            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
                        )
-                        .await
-                        .ok_or_else(|| TimeTravelError::Cancelled)
-                        .and_then(|x| x)?;
+                        .await?;
                        tracing::info!(%version_id, %key, "Copied old version in S3");
                    }
                    VerOrDelete {
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -4,7 +4,6 @@
 use bytes::Bytes;
 use futures::stream::Stream;
 use std::collections::HashMap;
-use std::num::NonZeroU32;
 use std::sync::Mutex;
 use std::time::SystemTime;
 use std::{collections::hash_map::Entry, sync::Arc};
@@ -61,7 +60,7 @@ impl UnreliableWrapper {
    /// On the first attempts of this operation, return an error. After 'attempts_to_fail'
    /// attempts, let the operation go ahead, and clear the counter.
    ///
-    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
+    fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
        let mut attempts = self.attempts.lock().unwrap();

        match attempts.entry(op) {
@@ -79,13 +78,13 @@ impl UnreliableWrapper {
                } else {
                    let error =
                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
-                    Err(error)
+                    Err(DownloadError::Other(error))
                }
            }
            Entry::Vacant(e) => {
                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                e.insert(1);
-                Err(error)
+                Err(DownloadError::Other(error))
            }
        }
    }
@@ -106,30 +105,22 @@ impl RemoteStorage for UnreliableWrapper {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
        self.inner.list_prefixes(prefix).await
    }

-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys).await
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
+        self.inner.list_files(folder).await
    }

    async fn list(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
    ) -> Result<Listing, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list(prefix, mode, max_keys).await
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list(prefix, mode).await
    }

    async fn upload(
@@ -146,8 +137,7 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        self.attempt(RemoteOp::Download(from.clone()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::Download(from.clone()))?;
        self.inner.download(from).await
    }

@@ -160,8 +150,7 @@ impl RemoteStorage for UnreliableWrapper {
        // Note: We treat any download_byte_range as an "attempt" of the same
        // operation. We don't pay attention to the ranges. That's good enough
        // for now.
-        self.attempt(RemoteOp::Download(from.clone()))
-            .map_err(DownloadError::Other)?;
+        self.attempt(RemoteOp::Download(from.clone()))?;
        self.inner
            .download_byte_range(from, start_inclusive, end_exclusive)
            .await
@@ -201,10 +190,10 @@ impl RemoteStorage for UnreliableWrapper {
        prefix: Option<&RemotePath>,
        timestamp: SystemTime,
        done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
-            .map_err(TimeTravelError::Other)?;
+            .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
        self.inner
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -1,33 +0,0 @@
-use std::{
-    pin::Pin,
-    task::{Context, Poll},
-};
-
-use futures_util::Stream;
-
-pin_project_lite::pin_project! {
-    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    pub(crate) struct PermitCarrying<S> {
-        permit: tokio::sync::OwnedSemaphorePermit,
-        #[pin]
-        inner: S,
-    }
-}
-
-impl<S> PermitCarrying<S> {
-    pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        Self { permit, inner }
-    }
-}
-
-impl<S: Stream> Stream for PermitCarrying<S> {
-    type Item = <S as Stream>::Item;
-
-    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        self.project().inner.poll_next(cx)
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
-    }
-}
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,8 +1,8 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use remote_storage::RemotePath;
+use std::collections::HashSet;
 use std::sync::Arc;
-use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
 use tracing::debug;

@@ -103,7 +103,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None)
+        .list_files(None)
        .await
        .context("client list root files failure")?
        .into_iter()
@@ -113,17 +113,8 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
        ctx.remote_blobs.clone(),
        "remote storage list_files on root mismatches with the uploads."
    );
-
-    // Test that max_keys limit works. In total there are about 21 files (see
-    // upload_simple_remote_data call in test_real_s3.rs).
-    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()))
-        .await
-        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
-
    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None)
+        .list_files(Some(&base_prefix))
        .await
        .context("client list nested files failure")?
        .into_iter()
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -56,10 +56,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
            warn_threshold,
            max_retries,
            "test retry",
-            &CancellationToken::new(),
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
        )
        .await
-        .expect("never cancelled")
    }

    async fn time_point() -> SystemTime {
@@ -70,15 +69,13 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    }

    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None))
+        Ok(retry(|| client.list_files(None))
            .await
            .context("list root files failure")?
            .into_iter()
            .collect::<HashSet<_>>())
    }

-    let cancel = CancellationToken::new();
-
    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

@@ -145,7 +142,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // No changes after recovery to t2 (no-op)
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t2, t_final, &cancel)
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
        .await?;
    let t2_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t2: {t2_files_recovered:?}");
@@ -156,7 +153,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t1: path1 is back, path2 has the old content
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t1, t_final, &cancel)
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
        .await?;
    let t1_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t1: {t1_files_recovered:?}");
@@ -167,7 +164,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t0: everything is gone except for path1
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t0, t_final, &cancel)
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
        .await?;
    let t0_files_recovered = list_files(&ctx.client).await?;
    println!("after recovery to t0: {t0_files_recovered:?}");
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -37,53 +37,69 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
    }
 }

-/// Retries passed operation until one of the following conditions are met:
-/// - encountered error is considered as permanent (non-retryable)
-/// - retries have been exhausted
-/// - cancellation token has been cancelled
-///
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent
-/// errors. When attempts cross `warn_threshold` function starts to emit log warnings.
+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
+/// Retries the passed operation until one of the following conditions is met:
+/// the encountered error is considered permanent (non-retryable), or
+/// retries have been exhausted.
+/// The `is_permanent` closure should be used to distinguish permanent from non-permanent errors.
+/// When attempts cross `warn_threshold`, the function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-/// `cancel` cancels new attempts and the backoff sleep.
-///
-/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work
-/// for any other error type. Final failed attempt is logged with `{:?}`.
-///
-/// Returns `None` if cancellation was noticed during backoff or the terminal result.
-pub async fn retry<T, O, F, E>(
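+/// The `cancel` argument is required: any time we are looping on retry, we should be using a `CancellationToken`
+/// to drop out promptly on shutdown. When the token is found cancelled before an attempt, the error produced by the `on_cancel` closure is returned.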
+pub async fn retry<T, O, F, E, CF>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
-    cancel: &CancellationToken,
-) -> Option<Result<T, E>>
+    cancel: Cancel<E, CF>,
+) -> Result<T, E>
 where
    // Not std::error::Error because anyhow::Error doesnt implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
    let mut attempts = 0;
    loop {
-        if cancel.is_cancelled() {
-            return None;
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
        }

        let result = op().await;
-        match &result {
+        match result {
            Ok(_) => {
                if attempts > 0 {
                    tracing::info!("{description} succeeded after {attempts} retries");
                }
-                return Some(result);
+                return result;
            }

            // These are "permanent" errors that should not be retried.
-            Err(e) if is_permanent(e) => {
-                return Some(result);
+            Err(ref e) if is_permanent(e) => {
+                return result;
            }
            // Assume that any other failure might be transient, and the operation might
            // succeed if we just keep trying.
@@ -93,12 +109,12 @@ where
            Err(err) if attempts < max_retries => {
                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
-            Err(err) => {
+            Err(ref err) => {
                // Operation failed `max_attempts` times. Time to give up.
                tracing::warn!(
                    "{description} still failed after {attempts} retries, giving up: {err:?}"
                );
-                return Some(result);
+                return result;
            }
        }
        // sleep and retry
@@ -106,7 +122,7 @@ where
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
-            cancel,
+            &cancel.token,
        )
        .await;
        attempts += 1;
@@ -115,10 +131,12 @@ where

 #[cfg(test)]
 mod tests {
-    use super::*;
    use std::io;
+
    use tokio::sync::Mutex;

+    use super::*;
+
    #[test]
    fn backoff_defaults_produce_growing_backoff_sequence() {
        let mut current_backoff_value = None;
@@ -148,7 +166,7 @@ mod tests {
    #[tokio::test(start_paused = true)]
    async fn retry_always_error() {
        let count = Mutex::new(0);
-        retry(
+        let err_result = retry(
            || async {
                *count.lock().await += 1;
                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
@@ -157,11 +175,11 @@ mod tests {
            1,
            1,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
-        .await
-        .expect("not cancelled")
-        .expect_err("it can only fail");
+        .await;
+
+        assert!(err_result.is_err());

        assert_eq!(*count.lock().await, 2);
    }
@@ -183,11 +201,10 @@ mod tests {
            2,
            2,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
-        .expect("not cancelled")
-        .expect("success on second try");
+        .unwrap();
    }

    #[tokio::test(start_paused = true)]
@@ -207,11 +224,10 @@ mod tests {
            2,
            2,
            "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
-        .expect("was not cancellation")
-        .expect_err("it was permanent error");
+        .unwrap_err();

        assert_eq!(*count.lock().await, 1);
    }
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -27,11 +27,6 @@ impl Barrier {
            b.wait().await
        }
    }
-
-    /// Return true if a call to wait() would complete immediately
-    pub fn is_ready(&self) -> bool {
-        futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
-    }
 }

 impl PartialEq for Barrier {
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -20,13 +20,13 @@
 //!
 //! // Then, in the main code:
 //!
-//! let span = tracing::info_span!("TestSpan", tenant_id = 1);
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
 //! let _guard = span.enter();
 //!
 //! // ... down the call stack
 //!
-//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor};
-//! let extractor = ConstExtractor::new("tenant_id");
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
 //! if let Err(missing) = check_fields_present!([&extractor]) {
 //!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
 //!    // to get the "user" code location for the panic.
@@ -45,26 +45,27 @@ pub enum ExtractionResult {
 }

 pub trait Extractor: Send + Sync + std::fmt::Debug {
-    fn id(&self) -> &str;
+    fn name(&self) -> &str;
    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
 }

 #[derive(Debug)]
-pub struct ConstExtractor {
-    field_name: &'static str,
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
 }

-impl ConstExtractor {
-    pub const fn new(field_name: &'static str) -> ConstExtractor {
-        ConstExtractor { field_name }
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
    }
 }
-impl Extractor for ConstExtractor {
-    fn id(&self) -> &str {
-        self.field_name
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
    }
    fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
-        if fields.iter().any(|f| f.name() == self.field_name) {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
            ExtractionResult::Present
        } else {
            ExtractionResult::Absent
@@ -202,19 +203,19 @@ mod tests {
    }
    impl<'a> fmt::Debug for MemoryIdentity<'a> {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
+            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
        }
    }

    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
-        tenant_extractor: ConstExtractor,
-        timeline_extractor: ConstExtractor,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
    }

    fn setup_current_thread() -> Setup {
-        let tenant_extractor = ConstExtractor::new("tenant_id");
-        let timeline_extractor = ConstExtractor::new("timeline_id");
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);

        let registry = tracing_subscriber::registry()
            .with(tracing_subscriber::fmt::layer())
@@ -342,12 +343,12 @@ mod tests {
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractor = ConstExtractor::new("e");
+        let extractor = MultiNameExtractor::new("E", ["e"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");

        // similarly for a not found key
-        let extractor = ConstExtractor::new("foobar");
+        let extractor = MultiNameExtractor::new("F", ["foobar"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
    }
@@ -367,14 +368,16 @@ mod tests {
        // normally this would work, but without any tracing-subscriber configured, both
        // check_field_present find nothing
        let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] =
-            [&ConstExtractor::new("e"), &ConstExtractor::new("f")];
+        let extractors: [&dyn Extractor; 2] = [
+            &MultiNameExtractor::new("E", ["e"]),
+            &MultiNameExtractor::new("F", ["f"]),
+        ];

        let res = check_fields_present0(extractors);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");

        // similarly for a not found key
-        let extractor = ConstExtractor::new("g");
+        let extractor = MultiNameExtractor::new("G", ["g"]);
        let res = check_fields_present0([&extractor]);
        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
    }
@@ -407,7 +410,7 @@ mod tests {
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")];
+        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];

        if span.is_disabled() {
            // the tests are running single threaded, or we got lucky and no other tests subscriber
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -453,12 +453,9 @@ mod tests {
                event_mask: 0,
            }),
            expected_messages: vec![
-                // TODO: When updating Postgres versions, this test will cause
-                // problems. Postgres version in message needs updating.
-                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -56,18 +56,10 @@ pub enum ForceAwaitLogicalSize {

 impl Client {
    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
-        Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
-    }
-
-    pub fn from_client(
-        client: reqwest::Client,
-        mgmt_api_endpoint: String,
-        jwt: Option<&str>,
-    ) -> Self {
        Self {
            mgmt_api_endpoint,
            authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
-            client,
+            client: reqwest::Client::new(),
        }
    }

@@ -318,22 +310,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn tenant_shard_split(
-        &self,
-        tenant_shard_id: TenantShardId,
-        req: TenantShardSplitRequest,
-    ) -> Result<TenantShardSplitResponse> {
-        let uri = format!(
-            "{}/v1/tenant/{}/shard_split",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.request(Method::PUT, &uri, req)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn timeline_list(
        &self,
        tenant_shard_id: &TenantShardId,
@@ -363,16 +339,4 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
-
-    pub async fn put_io_engine(
-        &self,
-        engine: &pageserver_api::models::virtual_file::IoEngineKind,
-    ) -> Result<()> {
-        let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint);
-        self.request(Method::PUT, uri, engine)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
 }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
    let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -51,10 +51,6 @@ pub(crate) struct Args {
    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
    #[clap(long)]
    keyspace_cache: Option<Utf8PathBuf>,
-    /// Before starting the benchmark, live-reconfigure the pageserver to use the given
-    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
-    #[clap(long)]
-    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -83,12 +79,6 @@ impl KeyRange {
    }
 }

-#[derive(PartialEq, Eq, Hash, Copy, Clone)]
-struct WorkerId {
-    timeline: TenantTimelineId,
-    num_client: usize, // from 0..args.num_clients
-}
-
 #[derive(serde::Serialize)]
 struct Output {
    total: request_stats::Output,
@@ -113,10 +103,6 @@ async fn main_impl(
        args.pageserver_jwt.as_deref(),
    ));

-    if let Some(engine_str) = &args.set_io_engine {
-        mgmt_api_client.put_io_engine(engine_str).await?;
-    }
-
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
@@ -220,7 +206,7 @@ async fn main_impl(

    let live_stats = Arc::new(LiveStats::default());

-    let num_client_tasks = args.num_clients.get() * timelines.len();
+    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;
    let num_main_impl = 1;
@@ -249,25 +235,19 @@ async fn main_impl(

    let cancel = CancellationToken::new();

-    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
+    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
    let mut tasks = Vec::new();
-    for timeline in timelines.iter().cloned() {
-        for num_client in 0..args.num_clients.get() {
-            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-            let worker_id = WorkerId {
-                timeline,
-                num_client,
-            };
-            work_senders.insert(worker_id, sender);
-            tasks.push(tokio::spawn(client(
-                args,
-                worker_id,
-                Arc::clone(&start_work_barrier),
-                receiver,
-                Arc::clone(&live_stats),
-                cancel.clone(),
-            )));
-        }
+    for tl in &timelines {
+        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
+        work_senders.insert(*tl, sender);
+        tasks.push(tokio::spawn(client(
+            args,
+            *tl,
+            Arc::clone(&start_work_barrier),
+            receiver,
+            Arc::clone(&live_stats),
+            cancel.clone(),
+        )));
    }

    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
@@ -291,10 +271,7 @@ async fn main_impl(
                        let (rel_tag, block_no) =
                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                        (
-                            WorkerId {
-                                timeline: r.timeline,
-                                num_client: rng.gen_range(0..args.num_clients.get()),
-                            },
+                            r.timeline,
                            PagestreamGetPageRequest {
                                latest: rng.gen_bool(args.req_latest_probability),
                                lsn: r.timeline_lsn,
@@ -312,54 +289,56 @@ async fn main_impl(
            }),
            Some(rps_limit) => Box::pin(async move {
                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
-                    &|worker_id| {
-                        let sender = work_senders.get(&worker_id).unwrap();
-                        let ranges: Vec<KeyRange> = all_ranges
-                            .iter()
-                            .filter(|r| r.timeline == worker_id.timeline)
-                            .cloned()
-                            .collect();
-                        let weights = rand::distributions::weighted::WeightedIndex::new(
-                            ranges.iter().map(|v| v.len()),
-                        )
-                        .unwrap();
+                let make_timeline_task: &dyn Fn(
+                    TenantTimelineId,
+                )
+                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                    let sender = work_senders.get(&timeline).unwrap();
+                    let ranges: Vec<KeyRange> = all_ranges
+                        .iter()
+                        .filter(|r| r.timeline == timeline)
+                        .cloned()
+                        .collect();
+                    let weights = rand::distributions::weighted::WeightedIndex::new(
+                        ranges.iter().map(|v| v.len()),
+                    )
+                    .unwrap();

-                        let cancel = cancel.clone();
-                        Box::pin(async move {
-                            let mut ticker = tokio::time::interval(period);
-                            ticker.set_missed_tick_behavior(
-                                /* TODO review this choice */
-                                tokio::time::MissedTickBehavior::Burst,
-                            );
-                            while !cancel.is_cancelled() {
-                                ticker.tick().await;
-                                let req = {
-                                    let mut rng = rand::thread_rng();
-                                    let r = &ranges[weights.sample(&mut rng)];
-                                    let key: i128 = rng.gen_range(r.start..r.end);
-                                    let key = Key::from_i128(key);
-                                    assert!(is_rel_block_key(&key));
-                                    let (rel_tag, block_no) = key_to_rel_block(key)
-                                        .expect("we filter non-rel-block keys out above");
-                                    PagestreamGetPageRequest {
-                                        latest: rng.gen_bool(args.req_latest_probability),
-                                        lsn: r.timeline_lsn,
-                                        rel: rel_tag,
-                                        blkno: block_no,
-                                    }
-                                };
-                                if sender.send(req).await.is_err() {
-                                    assert!(
-                                        cancel.is_cancelled(),
-                                        "client has gone away unexpectedly"
-                                    );
+                    let cancel = cancel.clone();
+                    Box::pin(async move {
+                        let mut ticker = tokio::time::interval(period);
+                        ticker.set_missed_tick_behavior(
+                            /* TODO review this choice */
+                            tokio::time::MissedTickBehavior::Burst,
+                        );
+                        while !cancel.is_cancelled() {
+                            ticker.tick().await;
+                            let req = {
+                                let mut rng = rand::thread_rng();
+                                let r = &ranges[weights.sample(&mut rng)];
+                                let key: i128 = rng.gen_range(r.start..r.end);
+                                let key = Key::from_i128(key);
+                                assert!(is_rel_block_key(&key));
+                                let (rel_tag, block_no) = key_to_rel_block(key)
+                                    .expect("we filter non-rel-block keys out above");
+                                PagestreamGetPageRequest {
+                                    latest: rng.gen_bool(args.req_latest_probability),
+                                    lsn: r.timeline_lsn,
+                                    rel: rel_tag,
+                                    blkno: block_no,
                                }
+                            };
+                            if sender.send(req).await.is_err() {
+                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
                            }
-                        })
-                    };
+                        }
+                    })
+                };

-                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
+                let tasks: Vec<_> = work_senders
+                    .keys()
+                    .map(|tl| make_timeline_task(*tl))
+                    .collect();

                start_work_barrier.wait().await;

@@ -411,16 +390,12 @@ async fn main_impl(
 #[instrument(skip_all)]
 async fn client(
    args: &'static Args,
-    id: WorkerId,
+    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
    live_stats: Arc<LiveStats>,
    cancel: CancellationToken,
 ) {
-    let WorkerId {
-        timeline,
-        num_client: _,
-    } = id;
    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
        .await
        .unwrap();
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -272,12 +272,6 @@ fn start_pageserver(
    );
    set_build_info_metric(GIT_VERSION, BUILD_TAG);
    set_launch_timestamp_metric(launch_ts);
-    #[cfg(target_os = "linux")]
-    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    metrics::register_internal(Box::new(
-        pageserver::metrics::tokio_epoll_uring::Collector::new(),
-    ))
-    .unwrap();
    pageserver::preinitialize_metrics();

    // If any failpoints were set from FAILPOINTS environment variable,
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -262,33 +262,35 @@ async fn upload(
 ) -> Result<(), UploadError> {
    let warn_after = 3;
    let max_attempts = 10;
-
-    // this is used only with tests so far
-    let last_value = if is_last { "true" } else { "false" };
-
    let res = utils::backoff::retry(
-        || async {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .header(reqwest::header::CONTENT_TYPE, "application/json")
-                .header(LAST_IN_BATCH.clone(), last_value)
-                .body(body.clone())
-                .send()
-                .await;
+        move || {
+            let body = body.clone();
+            async move {
+                let res = client
+                    .post(metric_collection_endpoint.clone())
+                    .header(reqwest::header::CONTENT_TYPE, "application/json")
+                    .header(
+                        LAST_IN_BATCH.clone(),
+                        if is_last { "true" } else { "false" },
+                    )
+                    .body(body)
+                    .send()
+                    .await;

-            let res = res.and_then(|res| res.error_for_status());
+                let res = res.and_then(|res| res.error_for_status());

-            // 10 redirects are normally allowed, so we don't need worry about 3xx
-            match res {
-                Ok(_response) => Ok(()),
-                Err(e) => {
-                    let status = e.status().filter(|s| s.is_client_error());
-                    if let Some(status) = status {
-                        // rejection used to be a thing when the server could reject a
-                        // whole batch of metrics if one metric was bad.
-                        Err(UploadError::Rejected(status))
-                    } else {
-                        Err(UploadError::Reqwest(e))
+                // 10 redirects are normally allowed, so we don't need worry about 3xx
+                match res {
+                    Ok(_response) => Ok(()),
+                    Err(e) => {
+                        let status = e.status().filter(|s| s.is_client_error());
+                        if let Some(status) = status {
+                            // rejection used to be a thing when the server could reject a
+                            // whole batch of metrics if one metric was bad.
+                            Err(UploadError::Rejected(status))
+                        } else {
+                            Err(UploadError::Reqwest(e))
+                        }
                    }
                }
            }
@@ -297,11 +299,9 @@ async fn upload(
        warn_after,
        max_attempts,
        "upload consumption_metrics",
-        cancel,
+        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
    )
-    .await
-    .ok_or_else(|| UploadError::Cancelled)
-    .and_then(|x| x);
+    .await;

    match &res {
        Ok(_) => {}
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -82,29 +82,46 @@ impl ControlPlaneClient {
        R: Serialize,
        T: DeserializeOwned,
    {
-        let res = backoff::retry(
+        #[derive(thiserror::Error, Debug)]
+        enum RemoteAttemptError {
+            #[error("shutdown")]
+            Shutdown,
+            #[error("remote: {0}")]
+            Remote(reqwest::Error),
+        }
+
+        match backoff::retry(
            || async {
                let response = self
                    .http_client
                    .post(url.clone())
                    .json(&request)
                    .send()
-                    .await?;
+                    .await
+                    .map_err(RemoteAttemptError::Remote)?;

-                response.error_for_status_ref()?;
-                response.json::<T>().await
+                response
+                    .error_for_status_ref()
+                    .map_err(RemoteAttemptError::Remote)?;
+                response
+                    .json::<T>()
+                    .await
+                    .map_err(RemoteAttemptError::Remote)
            },
            |_| false,
            3,
            u32::MAX,
            "calling control plane generation validation API",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
        )
        .await
-        .ok_or(RetryForeverError::ShuttingDown)?
-        .expect("We retry forever, this should never be reached");
-
-        Ok(res)
+        {
+            Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
+            Err(RemoteAttemptError::Remote(_)) => {
+                panic!("We retry forever, this should never be reached");
+            }
+            Ok(r) => Ok(r),
+        }
    }
 }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -700,6 +700,8 @@ impl DeletionQueue {
    }

    pub async fn shutdown(&mut self, timeout: Duration) {
+        self.cancel.cancel();
+
        match tokio::time::timeout(timeout, self.client.flush()).await {
            Ok(Ok(())) => {
                tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -713,10 +715,6 @@ impl DeletionQueue {
                tracing::warn!("Timed out flushing deletion queue on shutdown")
            }
        }
-
-        // We only cancel _after_ flushing: otherwise we would be shutting down the
-        // components that do the flush.
-        self.cancel.cancel();
    }
 }

--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -77,11 +77,9 @@ impl Deleter {
            3,
            10,
            "executing deletion batch",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Shutting down"))
-        .and_then(|x| x)
    }

    /// Block until everything in accumulator has been executed
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -623,7 +623,6 @@ impl std::fmt::Display for EvictionLayer {
    }
 }

-#[derive(Default)]
 pub(crate) struct DiskUsageEvictionInfo {
    /// Timeline's largest layer (remote or resident)
    pub max_layer_size: Option<u64>,
@@ -855,27 +854,19 @@ async fn collect_eviction_candidates(

        let total = tenant_candidates.len();

-        let tenant_candidates =
-            tenant_candidates
-                .into_iter()
-                .enumerate()
-                .map(|(i, mut candidate)| {
-                    // as we iterate this reverse sorted list, the most recently accessed layer will always
-                    // be 1.0; this is for us to evict it last.
-                    candidate.relative_last_activity =
-                        eviction_order.relative_last_activity(total, i);
+        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
+            // as we iterate this reverse sorted list, the most recently accessed layer will always
+            // be 1.0; this is for us to evict it last.
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);

-                    let partition = if cumsum > min_resident_size as i128 {
-                        MinResidentSizePartition::Above
-                    } else {
-                        MinResidentSizePartition::Below
-                    };
-                    cumsum += i128::from(candidate.layer.get_file_size());
-
-                    (partition, candidate)
-                });
-
-        candidates.extend(tenant_candidates);
+            let partition = if cumsum > min_resident_size as i128 {
+                MinResidentSizePartition::Above
+            } else {
+                MinResidentSizePartition::Below
+            };
+            cumsum += i128::from(candidate.layer.get_file_size());
+            candidates.push((partition, candidate));
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -891,41 +882,21 @@ async fn collect_eviction_candidates(
    );

    for secondary_tenant in secondary_tenants {
-        // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
-        // to prevent repeated disk usage based evictions from completely draining less often
-        // updating secondaries.
-        let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
-
-        debug_assert!(
-            total_layers >= layer_info.resident_layers.len(),
-            "total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
-            layer_info.resident_layers.len()
-        );
+        let mut layer_info = secondary_tenant.get_layers_for_eviction();

        layer_info
            .resident_layers
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));

-        let tenant_candidates =
-            layer_info
-                .resident_layers
-                .into_iter()
-                .enumerate()
-                .map(|(i, mut candidate)| {
-                    candidate.relative_last_activity =
-                        eviction_order.relative_last_activity(total_layers, i);
-                    (
-                        // Secondary locations' layers are always considered above the min resident size,
-                        // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                        // the layers have sufficiently old access times.
-                        MinResidentSizePartition::Above,
-                        candidate,
-                    )
-                });
-
-        candidates.extend(tenant_candidates);
-
-        tokio::task::yield_now().await;
+        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
+            (
+                // Secondary locations' layers are always considered above the min resident size,
+                // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                // the layers have sufficiently old access times.
+                MinResidentSizePartition::Above,
+                candidate,
+            )
+        }));
    }

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,14 +19,11 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
 use pageserver_api::models::TenantShardLocation;
-use pageserver_api::models::TenantShardSplitRequest;
-use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
-use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
@@ -492,12 +489,6 @@ async fn timeline_create_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
-            tracing::info!(%ancestor_id, "starting to branch");
-        } else {
-            tracing::info!("bootstrapping");
-        }
-
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -538,7 +529,7 @@ async fn timeline_create_handler(
    }
    .instrument(info_span!("timeline_create",
        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
+        shard = %tenant_shard_id.shard_slug(),
        timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }
@@ -834,7 +825,7 @@ async fn timeline_delete_handler(
            }
        })?;
    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
+    tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
        .await?;

    json_response(StatusCode::ACCEPTED, ())
@@ -859,7 +850,7 @@ async fn tenant_detach_handler(
        detach_ignored.unwrap_or(false),
        &state.deletion_queue_client,
    )
-    .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
+    .instrument(info_span!("tenant_detach", %tenant_id))
    .await?;

    json_response(StatusCode::OK, ())
@@ -878,7 +869,7 @@ async fn tenant_reset_handler(
    let state = get_state(&request);
    state
        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1010,7 +1001,7 @@ async fn tenant_delete_handler(
        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
-            shard_id = %tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
        ))
        .await?;

@@ -1107,25 +1098,6 @@ async fn tenant_size_handler(
    )
 }

-async fn tenant_shard_split_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let req: TenantShardSplitRequest = json_request(&mut request).await?;
-
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let state = get_state(&request);
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let new_shards = state
-        .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
-}
-
 async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1385,7 +1357,7 @@ async fn put_tenant_location_config_handler(
            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                .instrument(info_span!("tenant_detach",
                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
+                    shard = %tenant_shard_id.shard_slug()
                ))
                .await
        {
@@ -1930,15 +1902,6 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_io_engine_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
-    crate::virtual_file::io_engine::set(kind);
-    json_response(StatusCode::OK, ())
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2085,9 +2048,6 @@ pub fn make_router(
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
-        .put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
-            api_handler(r, tenant_shard_split_handler)
-        })
        .get("/v1/tenant/:tenant_shard_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
@@ -2199,6 +2159,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
        )
-        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -17,7 +17,6 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
-pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2400,72 +2400,6 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
    }
 }

-pub mod tokio_epoll_uring {
-    use metrics::UIntGauge;
-
-    pub struct Collector {
-        descs: Vec<metrics::core::Desc>,
-        systems_created: UIntGauge,
-        systems_destroyed: UIntGauge,
-    }
-
-    const NMETRICS: usize = 2;
-
-    impl metrics::core::Collector for Collector {
-        fn desc(&self) -> Vec<&metrics::core::Desc> {
-            self.descs.iter().collect()
-        }
-
-        fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
-            let mut mfs = Vec::with_capacity(NMETRICS);
-            let tokio_epoll_uring::metrics::Metrics {
-                systems_created,
-                systems_destroyed,
-            } = tokio_epoll_uring::metrics::global();
-            self.systems_created.set(systems_created);
-            mfs.extend(self.systems_created.collect());
-            self.systems_destroyed.set(systems_destroyed);
-            mfs.extend(self.systems_destroyed.collect());
-            mfs
-        }
-    }
-
-    impl Collector {
-        #[allow(clippy::new_without_default)]
-        pub fn new() -> Self {
-            let mut descs = Vec::new();
-
-            let systems_created = UIntGauge::new(
-                "pageserver_tokio_epoll_uring_systems_created",
-                "counter of tokio-epoll-uring systems that were created",
-            )
-            .unwrap();
-            descs.extend(
-                metrics::core::Collector::desc(&systems_created)
-                    .into_iter()
-                    .cloned(),
-            );
-
-            let systems_destroyed = UIntGauge::new(
-                "pageserver_tokio_epoll_uring_systems_destroyed",
-                "counter of tokio-epoll-uring systems that were destroyed",
-            )
-            .unwrap();
-            descs.extend(
-                metrics::core::Collector::desc(&systems_destroyed)
-                    .into_iter()
-                    .cloned(),
-            );
-
-            Self {
-                descs,
-                systems_created,
-                systems_destroyed,
-            }
-        }
-    }
-}
-
 pub fn preinitialize_metrics() {
    // Python tests need these and on some we do alerting.
    //
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -63,10 +63,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::pgdatadir_mapping::Version;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
@@ -91,8 +90,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 /// `tokio_tar` already read the first such block. Read the second all-zeros block,
 /// and check that there is no more data after the EOF marker.
 ///
-/// 'tar' command can also write extra blocks of zeros, up to a record
-/// size, controlled by the --record-size argument. Ignore them too.
+/// XXX: Currently, any trailing data after the EOF marker prints a warning.
+/// Perhaps it should be a hard error?
 async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
    use tokio::io::AsyncReadExt;
    let mut buf = [0u8; 512];
@@ -113,24 +112,17 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
        anyhow::bail!("invalid tar EOF marker");
    }

-    // Drain any extra zero-blocks after the EOF marker
+    // Drain any data after the EOF marker
    let mut trailing_bytes = 0;
-    let mut seen_nonzero_bytes = false;
    loop {
        let nbytes = reader.read(&mut buf).await?;
        trailing_bytes += nbytes;
-        if !buf.iter().all(|&x| x == 0) {
-            seen_nonzero_bytes = true;
-        }
        if nbytes == 0 {
            break;
        }
    }
-    if seen_nonzero_bytes {
-        anyhow::bail!("unexpected non-zero bytes after the tar archive");
-    }
-    if trailing_bytes % 512 != 0 {
-        anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
+    if trailing_bytes > 0 {
+        warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
    }
    Ok(())
 }
@@ -557,7 +549,7 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
@@ -639,7 +631,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -728,7 +719,7 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        // Create empty timeline
        info!("creating new timeline");
@@ -781,7 +772,7 @@ impl PageServerHandler {
        Ok(())
    }

-    #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
+    #[instrument(skip_all, fields(%start_lsn, %end_lsn))]
    async fn handle_import_wal<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -794,6 +785,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
@@ -900,7 +893,6 @@ impl PageServerHandler {
        Ok(lsn)
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_rel_exists_request(
        &mut self,
        tenant_id: TenantId,
@@ -927,7 +919,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_nblocks_request(
        &mut self,
        tenant_id: TenantId,
@@ -955,7 +946,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_db_size_request(
        &mut self,
        tenant_id: TenantId,
@@ -1106,7 +1096,6 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_page_at_lsn_request(
        &mut self,
        tenant_id: TenantId,
@@ -1140,9 +1129,6 @@ impl PageServerHandler {
            }
        };

-        // load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
-        set_tracing_field_shard_id(timeline);
-
        let _timer = timeline
            .query_metrics
            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
@@ -1161,7 +1147,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_slru_segment_request(
        &mut self,
        tenant_id: TenantId,
@@ -1190,7 +1175,7 @@ impl PageServerHandler {
    }

    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
+    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
@@ -1205,6 +1190,8 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1326,7 +1313,6 @@ impl PageServerHandler {
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
        let timeline = tenant.get_timeline(timeline_id, true)?;
-        set_tracing_field_shard_id(&timeline);
        Ok(timeline)
    }
 }
@@ -1491,29 +1477,21 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            async {
-                let timeline = self
-                    .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                    .await?;
+            let timeline = self
+                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .await?;

-                let end_of_timeline = timeline.get_last_record_rlsn();
+            let end_of_timeline = timeline.get_last_record_rlsn();

-                pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                    RowDescriptor::text_col(b"prev_lsn"),
-                    RowDescriptor::text_col(b"last_lsn"),
-                ]))?
-                .write_message_noflush(&BeMessage::DataRow(&[
-                    Some(end_of_timeline.prev.to_string().as_bytes()),
-                    Some(end_of_timeline.last.to_string().as_bytes()),
-                ]))?
-                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                anyhow::Ok(())
-            }
-            .instrument(info_span!(
-                "handle_get_last_record_lsn",
-                shard_id = tracing::field::Empty
-            ))
-            .await?;
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
+                RowDescriptor::text_col(b"prev_lsn"),
+                RowDescriptor::text_col(b"last_lsn"),
+            ]))?
+            .write_message_noflush(&BeMessage::DataRow(&[
+                Some(end_of_timeline.prev.to_string().as_bytes()),
+                Some(end_of_timeline.last.to_string().as_bytes()),
+            ]))?
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // same as basebackup, but result includes relational data as well
        else if query_string.starts_with("fullbackup ") {
@@ -1770,12 +1748,3 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
-
-fn set_tracing_field_shard_id(timeline: &Timeline) {
-    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(timeline.tenant_shard_id.shard_slug()),
-    );
-    debug_assert_current_span_has_tenant_and_timeline_id();
-}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -10,7 +10,6 @@ use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
@@ -700,7 +699,7 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -33,6 +33,9 @@ impl Value {
    }
 }

+/// The maximum size of a value supported by the pageserver
+pub const MAX_VALUE_SIZE: usize = 10_000_000;
+
 #[cfg(test)]
 mod test {
    use super::*;
--- a/pageserver/src/span.rs
+++ b/pageserver/src/span.rs
@@ -1,43 +0,0 @@
-use utils::tracing_span_assert::check_fields_present;
-
-mod extractors {
-    use utils::tracing_span_assert::ConstExtractor;
-
-    pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id");
-    pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id");
-    pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id");
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID])
-        {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) = check_fields_present!([
-            &extractors::TENANT_ID,
-            &extractors::SHARD_ID,
-            &extractors::TIMELINE_ID,
-        ]) {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
-
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() {
-    if cfg!(debug_assertions) {
-        if let Err(missing) =
-            check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,])
-        {
-            panic!("missing extractors: {missing:?}")
-        }
-    }
-}
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -576,8 +576,8 @@ pub fn shutdown_token() -> CancellationToken {

 /// Has the current task been requested to shut down?
 pub fn is_shutdown_requested() -> bool {
-    if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
-        true_or_false
+    if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
+        cancel.is_cancelled()
    } else {
        if !cfg!(test) {
            warn!("is_shutdown_requested() called in an unexpected task or thread");
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -53,7 +53,6 @@ use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
-use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
@@ -68,9 +67,7 @@ use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::metrics::TENANT;
-use crate::metrics::{
-    remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
-};
+use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
@@ -101,7 +98,6 @@ use std::sync::Arc;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

-use crate::span;
 use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
@@ -152,6 +148,7 @@ pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
+mod span;

 pub mod metadata;
 mod par_fsync;
@@ -169,7 +166,7 @@ pub(crate) mod timeline;

 pub mod size;

-pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

 // re-export for use in remote_timeline_client.rs
@@ -208,7 +205,7 @@ impl AttachedTenantConf {
        match &location_conf.mode {
            LocationMode::Attached(attach_conf) => Ok(Self {
                tenant_conf: location_conf.tenant_conf,
-                location: *attach_conf,
+                location: attach_conf.clone(),
            }),
            LocationMode::Secondary(_) => {
                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
@@ -279,7 +276,7 @@ pub struct Tenant {
    // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
    // timeout...
    gc_cs: tokio::sync::Mutex<()>,
-    walredo_mgr: Option<Arc<WalRedoManager>>,
+    walredo_mgr: Arc<WalRedoManager>,

    // provides access to timeline data sitting in the remote storage
    pub(crate) remote_storage: Option<GenericRemoteStorage>,
@@ -628,15 +625,12 @@ impl Tenant {
            deletion_queue_client,
        } = resources;

-        let attach_mode = attached_conf.location.attach_mode;
-        let generation = attached_conf.location.generation;
-
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
            conf,
            attached_conf,
            shard_identity,
-            Some(wal_redo_manager),
+            wal_redo_manager,
            tenant_shard_id,
            remote_storage.clone(),
            deletion_queue_client,
@@ -660,12 +654,6 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
-
-                info!(
-                    ?attach_mode,
-                    "Attaching tenant"
-                );
-
                let _gate_guard = attach_gate_guard;

                // Is this tenant being spawned as part of process startup?
@@ -877,7 +865,7 @@ impl Tenant {
                Ok(())
            }
            .instrument({
-                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
+                let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                span.follows_from(Span::current());
                span
            }),
@@ -1196,6 +1184,10 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
                reason,
@@ -1206,7 +1198,7 @@ impl Tenant {
            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
            // to occupy the slot for this TenantShardId.
            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
-            None,
+            wal_redo_manager,
            tenant_shard_id,
            None,
            DeletionQueueClient::broken(),
@@ -1377,7 +1369,7 @@ impl Tenant {
                async move {
                    debug!("starting index part download");

-                    let index_part = client.download_index_file(&cancel_clone).await;
+                    let index_part = client.download_index_file(cancel_clone).await;

                    debug!("finished index part download");

@@ -1975,7 +1967,7 @@ impl Tenant {
    }

    pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
-        self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
+        self.walredo_mgr.status()
    }

    /// Changes tenant status to active, unless shutdown was already requested.
@@ -2362,7 +2354,12 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf
+            .read()
+            .unwrap()
+            .location
+            .attach_mode
+            .clone()
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -2398,67 +2395,6 @@ impl Tenant {
    pub(crate) fn get_generation(&self) -> Generation {
        self.generation
    }
-
-    /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
-    /// and can leave the tenant in a bad state if it fails.  The caller is responsible for
-    /// resetting this tenant to a valid state if we fail.
-    pub(crate) async fn split_prepare(
-        &self,
-        child_shards: &Vec<TenantShardId>,
-    ) -> anyhow::Result<()> {
-        let timelines = self.timelines.lock().unwrap().clone();
-        for timeline in timelines.values() {
-            let Some(tl_client) = &timeline.remote_client else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            let Some(remote_storage) = &self.remote_storage else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
-            // to ensure that they do not start a split if currently in the process of doing these.
-
-            // Upload an index from the parent: this is partly to provide freshness for the
-            // child tenants that will copy it, and partly for general ease-of-debugging: there will
-            // always be a parent shard index in the same generation as we wrote the child shard index.
-            tl_client.schedule_index_upload_for_file_changes()?;
-            tl_client.wait_completion().await?;
-
-            // Shut down the timeline's remote client: this means that the indices we write
-            // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await?;
-
-            // Download methods can still be used after shutdown, as they don't flow through the remote client's
-            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
-            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
-            // we use here really is the remotely persistent one).
-            let result = tl_client
-                .download_index_file(&self.cancel)
-                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
-                .await?;
-            let index_part = match result {
-                MaybeDeletedIndexPart::Deleted(_) => {
-                    anyhow::bail!("Timeline deletion happened concurrently with split")
-                }
-                MaybeDeletedIndexPart::IndexPart(p) => p,
-            };
-
-            for child_shard in child_shards {
-                upload_index_part(
-                    remote_storage,
-                    child_shard,
-                    &timeline.timeline_id,
-                    self.generation,
-                    &index_part,
-                    &self.cancel,
-                )
-                .await?;
-            }
-        }
-
-        Ok(())
-    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2671,7 +2607,7 @@ impl Tenant {
            self.tenant_shard_id,
            self.generation,
            self.shard_identity,
-            self.walredo_mgr.as_ref().map(Arc::clone),
+            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
            state,
@@ -2689,7 +2625,7 @@ impl Tenant {
        conf: &'static PageServerConf,
        attached_conf: AttachedTenantConf,
        shard_identity: ShardIdentity,
-        walredo_mgr: Option<Arc<WalRedoManager>>,
+        walredo_mgr: Arc<WalRedoManager>,
        tenant_shard_id: TenantShardId,
        remote_storage: Option<GenericRemoteStorage>,
        deletion_queue_client: DeletionQueueClient,
@@ -2697,16 +2633,9 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            // reflect tenant state in metrics:
-            // - global per tenant state: TENANT_STATE_METRIC
-            // - "set" of broken tenants: BROKEN_TENANTS_SET
-            //
-            // set of broken tenants should not have zero counts so that it remains accessible for
-            // alerting.
-
+            // Strings for metric labels
            let tid = tenant_shard_id.to_string();
-            let shard_id = tenant_shard_id.shard_slug().to_string();
-            let set_key = &[tid.as_str(), shard_id.as_str()][..];
+            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());

            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                ([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2715,13 +2644,21 @@ impl Tenant {
            let mut tuple = inspect_state(&rx.borrow_and_update());

            let is_broken = tuple.1;
-            let mut counted_broken = if is_broken {
-                // add the id to the set right away, there should not be any updates on the channel
-                // after before tenant is removed, if ever
-                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
-                true
-            } else {
+            let mut counted_broken = if !is_broken {
+                // the tenant might be ignored and reloaded, so first remove any previous set
+                // element. it most likely has already been scraped, as these are manual operations
+                // right now. most likely we will add it back very soon.
+                drop(
+                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
+                );
                false
+            } else {
+                // add the id to the set right away, there should not be any updates on the channel
+                // after
+                crate::metrics::BROKEN_TENANTS_SET
+                    .with_label_values(&[&tid, &shard_id_str])
+                    .set(1);
+                true
            };

            loop {
@@ -2730,9 +2667,10 @@ impl Tenant {
                current.inc();

                if rx.changed().await.is_err() {
-                    // tenant has been dropped
+                    // tenant has been dropped; decrement the counter because a tenant with that
+                    // state is no longer in tenant map, but allow any broken set item to exist
+                    // still.
                    current.dec();
-                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                    break;
                }

@@ -2742,9 +2680,10 @@ impl Tenant {
                let is_broken = tuple.1;
                if is_broken && !counted_broken {
                    counted_broken = true;
-                    // insert the tenant_id (back) into the set while avoiding needless counter
-                    // access
-                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                    // insert the tenant_id (back) into the set
+                    crate::metrics::BROKEN_TENANTS_SET
+                        .with_label_values(&[&tid, &shard_id_str])
+                        .inc();
                }
            }
        });
@@ -3286,6 +3225,8 @@ impl Tenant {
                .context("branch initial metadata upload")?;
        }

+        info!("branched timeline {dst_id} from {src_id} at {start_lsn}");
+
        Ok(new_timeline)
    }

@@ -3352,11 +3293,11 @@ impl Tenant {
            3,
            u32::MAX,
            "persist_initdb_tar_zst",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
-        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)
+        .await?;
+
+        Ok(())
    }

    /// - run initdb to init temporary instance and get bootstrap data
@@ -3503,6 +3444,12 @@ impl Tenant {
        // All done!
        let timeline = raw_timeline.finish_creation()?;

+        info!(
+            "created root timeline {} timeline.lsn {}",
+            timeline_id,
+            timeline.get_last_record_lsn()
+        );
+
        Ok(timeline)
    }

@@ -3794,10 +3741,6 @@ impl Tenant {

        Ok(())
    }
-
-    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf
-    }
 }

 fn remove_timeline_and_uninit_mark(
@@ -4060,10 +4003,6 @@ pub(crate) mod harness {
            })
        }

-        pub fn span(&self) -> tracing::Span {
-            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
-        }
-
        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
@@ -4117,7 +4056,7 @@ pub(crate) mod harness {
                .unwrap(),
                // This is a legacy/test code path: sharding isn't supported here.
                ShardIdentity::unsharded(),
-                Some(walredo_mgr),
+                walredo_mgr,
                self.tenant_shard_id,
                Some(self.remote_storage.clone()),
                self.deletion_queue.new_client(),
@@ -4668,7 +4607,7 @@ mod tests {
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
-                .instrument(harness.span())
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
                .await
                .ok()
                .unwrap();
@@ -4709,7 +4648,7 @@ mod tests {
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
-                .instrument(harness.span())
+                .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
                .await
                .ok()
                .unwrap();
@@ -4771,7 +4710,7 @@ mod tests {
        // so that all uploads finish & we can call harness.try_load() below again
        tenant
            .shutdown(Default::default(), true)
-            .instrument(harness.span())
+            .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
            .await
            .ok()
            .unwrap();
@@ -5304,7 +5243,7 @@ mod tests {
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown()
-                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
+                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
                .await;
            std::mem::forget(tline);
        }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -51,7 +51,7 @@ pub mod defaults {
    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

-#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
    /// Our generation is current as far as we know, and as far as we know we are the only attached
    /// pageserver.  This is the "normal" attachment mode.
@@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode {
    Stale,
 }

-#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) struct AttachedLocationConfig {
    pub(crate) generation: Generation,
    pub(crate) attach_mode: AttachmentMode,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -91,11 +91,9 @@ async fn create_remote_delete_mark(
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
-        cancel,
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-    .and_then(|x| x)
    .context("mark_upload")?;

    Ok(())
@@ -189,11 +187,9 @@ async fn remove_tenant_remote_delete_mark(
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)
        .context("remove_tenant_remote_delete_mark")?;
    }
    Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,6 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
@@ -23,7 +22,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
-use utils::{completion, crashsafe};
+use utils::crashsafe;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -608,6 +607,13 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

+    info!(
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        generation = ?location_conf.location.generation,
+        attach_mode = ?location_conf.location.attach_mode,
+        "Attaching tenant"
+    );
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
@@ -645,6 +651,8 @@ pub(crate) async fn shutdown_all_tenants() {
 }

 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
+    use utils::completion;
+
    let mut join_set = JoinSet::new();

    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
@@ -683,7 +691,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                                    // going to log too many lines
                                    debug!("tenant successfully stopped");
                                }
-                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
+                                .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())),
                            );

                            total_attached += 1;
@@ -1199,7 +1207,7 @@ impl TenantManager {
        &self,
        tenant_shard_id: TenantShardId,
        drop_cache: bool,
-        ctx: &RequestContext,
+        ctx: RequestContext,
    ) -> anyhow::Result<()> {
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let Some(old_slot) = slot_guard.get_old_value() else {
@@ -1252,7 +1260,7 @@ impl TenantManager {
            None,
            self.tenants,
            SpawnMode::Normal,
-            ctx,
+            &ctx,
        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;
@@ -1374,164 +1382,6 @@ impl TenantManager {
        slot_guard.revert();
        result
    }
-
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
-    pub(crate) async fn shard_split(
-        &self,
-        tenant_shard_id: TenantShardId,
-        new_shard_count: ShardCount,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Vec<TenantShardId>> {
-        let tenant = get_tenant(tenant_shard_id, true)?;
-
-        // Plan: identify what the new child shards will be
-        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
-        if new_shard_count <= ShardCount(effective_old_shard_count) {
-            anyhow::bail!("Requested shard count is not an increase");
-        }
-        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
-        if !expansion_factor.is_power_of_two() {
-            anyhow::bail!("Requested split is not a power of two");
-        }
-
-        let parent_shard_identity = tenant.shard_identity;
-        let parent_tenant_conf = tenant.get_tenant_conf();
-        let parent_generation = tenant.generation;
-
-        let child_shards = tenant_shard_id.split(new_shard_count);
-        tracing::info!(
-            "Shard {} splits into: {}",
-            tenant_shard_id.to_index(),
-            child_shards
-                .iter()
-                .map(|id| format!("{}", id.to_index()))
-                .join(",")
-        );
-
-        // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
-        if let Err(e) = tenant.split_prepare(&child_shards).await {
-            // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
-            // have been left in a partially-shut-down state.
-            tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
-            self.reset_tenant(tenant_shard_id, false, ctx).await?;
-            return Err(e);
-        }
-
-        self.resources.deletion_queue_client.flush_advisory();
-
-        // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
-        drop(tenant);
-        let mut parent_slot_guard =
-            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let parent = match parent_slot_guard.get_old_value() {
-            Some(TenantSlot::Attached(t)) => t,
-            Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
-            Some(TenantSlot::InProgress(_)) => {
-                // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
-                // it would return an error.
-                unreachable!()
-            }
-            None => {
-                // We don't actually need the parent shard to still be attached to do our work, but it's
-                // a weird enough situation that the caller probably didn't want us to continue working
-                // if they had detached the tenant they requested the split on.
-                anyhow::bail!("Detached parent shard in the middle of split!")
-            }
-        };
-
-        // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
-        // TODO: erase the dentries from the parent
-
-        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
-        // child shards to reach this point.
-        let mut target_lsns = HashMap::new();
-        for timeline in parent.timelines.lock().unwrap().clone().values() {
-            target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
-        }
-
-        // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
-        // and could slow down the children trying to catch up.
-
-        // Phase 3: Spawn the child shards
-        for child_shard in &child_shards {
-            let mut child_shard_identity = parent_shard_identity;
-            child_shard_identity.count = child_shard.shard_count;
-            child_shard_identity.number = child_shard.shard_number;
-
-            let child_location_conf = LocationConf {
-                mode: LocationMode::Attached(AttachedLocationConfig {
-                    generation: parent_generation,
-                    attach_mode: AttachmentMode::Single,
-                }),
-                shard: child_shard_identity,
-                tenant_conf: parent_tenant_conf,
-            };
-
-            self.upsert_location(
-                *child_shard,
-                child_location_conf,
-                None,
-                SpawnMode::Normal,
-                ctx,
-            )
-            .await?;
-        }
-
-        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
-        for child_shard_id in &child_shards {
-            let child_shard = {
-                let locked = TENANTS.read().unwrap();
-                let peek_slot =
-                    tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
-                peek_slot.and_then(|s| s.get_attached()).cloned()
-            };
-            if let Some(t) = child_shard {
-                let timelines = t.timelines.lock().unwrap().clone();
-                for timeline in timelines.values() {
-                    let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
-                        continue;
-                    };
-
-                    tracing::info!(
-                        "Waiting for child shard {}/{} to reach target lsn {}...",
-                        child_shard_id,
-                        timeline.timeline_id,
-                        target_lsn
-                    );
-                    if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
-                        // Failure here might mean shutdown, in any case this part is an optimization
-                        // and we shouldn't hold up the split operation.
-                        tracing::warn!(
-                            "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
-                            timeline.timeline_id
-                        );
-                    } else {
-                        tracing::info!(
-                            "Child shard {}/{} reached target lsn {}",
-                            child_shard_id,
-                            timeline.timeline_id,
-                            target_lsn
-                        );
-                    }
-                }
-            }
-        }
-
-        // Phase 5: Shut down the parent shard.
-        let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
-            Ok(()) => {}
-            Err(other) => {
-                other.wait().await;
-            }
-        }
-        parent_slot_guard.drop_old_value()?;
-
-        // Phase 6: Release the InProgress on the parent shard
-        drop(parent_slot_guard);
-
-        Ok(child_shards)
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1877,7 +1727,6 @@ pub(crate) async fn ignore_tenant(
    ignore_tenant0(conf, &TENANTS, tenant_id).await
 }

-#[instrument(skip_all, fields(shard_id))]
 async fn ignore_tenant0(
    conf: &'static PageServerConf,
    tenants: &std::sync::RwLock<TenantsMap>,
@@ -1885,10 +1734,6 @@ async fn ignore_tenant0(
 ) -> Result<(), TenantStateError> {
    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    tracing::Span::current().record(
-        "shard_id",
-        tracing::field::display(tenant_shard_id.shard_slug()),
-    );

    remove_tenant_from_memory(tenants, tenant_shard_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
@@ -2284,7 +2129,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
@@ -2366,6 +2211,8 @@ async fn remove_tenant_from_memory<V, F>(
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
+    use utils::completion;
+
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

@@ -2518,7 +2365,7 @@ pub(crate) async fn immediate_gc(
 mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;
-    use tracing::Instrument;
+    use tracing::{info_span, Instrument};

    use crate::tenant::mgr::TenantSlot;

@@ -2529,16 +2376,17 @@ mod tests {
        // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
        // wait for it to complete before proceeding.

-        let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
-        let (t, _ctx) = h.load().await;
+        let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant")
+            .unwrap()
+            .load()
+            .await;

        // harness loads it to active, which is forced and nothing is running on the tenant

        let id = t.tenant_shard_id();

        // tenant harness configures the logging and we cannot escape it
-        let span = h.span();
-        let _e = span.enter();
+        let _e = info_span!("testing", tenant_id = %id).entered();

        let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
        let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));
@@ -2559,7 +2407,7 @@ mod tests {
                    };
                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
                }
-                .instrument(h.span())
+                .instrument(info_span!("foobar", tenant_id = %id))
            });

            // now the long cleanup should be in place, with the stopping state
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -217,7 +217,6 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::download::download_retry;
 use crate::tenant::storage_layer::AsLayerDesc;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
@@ -263,11 +262,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -331,6 +325,11 @@ pub struct RemoteTimelineClient {
    cancel: CancellationToken,
 }

+/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
+/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
+const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
+
 /// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
 ///
 /// This is a convenience for the various upload functions.  In future
@@ -507,7 +506,7 @@ impl RemoteTimelineClient {
    /// Download index file
    pub async fn download_index_file(
        &self,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
@@ -1047,11 +1046,9 @@ impl RemoteTimelineClient {
            // when executed as part of tenant deletion this happens in the background
            2,
            "persist_index_part_with_deleted_flag",
-            &self.cancel,
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
        )
-        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
-        .and_then(|x| x)?;
+        .await?;

        // all good, disarm the guard and mark as success
        ScopeGuard::into_inner(undo_deleted_at);
@@ -1086,11 +1083,9 @@ impl RemoteTimelineClient {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "preserve_initdb_tar_zst",
-            &cancel.clone(),
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
-        .and_then(|x| x)
        .context("backing up initdb archive")?;
        Ok(())
    }
@@ -1146,19 +1141,20 @@ impl RemoteTimelineClient {
        // taking the burden of listing all the layers that we already know we should delete.
        self.deletion_queue_client.flush_immediate().await?;

-        let cancel = shutdown_token();
-
-        let remaining = download_retry(
+        let remaining = backoff::retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None)
+                    .list_files(Some(&timeline_storage_path))
                    .await
            },
-            "list remaining files",
-            &cancel,
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+            backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
        )
        .await
-        .context("list files remaining files")?;
+        .context("list prefixes")?;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -1347,7 +1343,6 @@ impl RemoteTimelineClient {
    /// queue.
    ///
    async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
-        let cancel = shutdown_token();
        // Loop to retry until it completes.
        loop {
            // If we're requested to shut down, close up shop and exit.
@@ -1359,7 +1354,7 @@ impl RemoteTimelineClient {
            // the Future, but we're not 100% sure if the remote storage library
            // is cancellation safe, so we don't dare to do that. Hopefully, the
            // upload finishes or times out soon enough.
-            if cancel.is_cancelled() {
+            if task_mgr::is_shutdown_requested() {
                info!("upload task cancelled by shutdown request");
                match self.stop() {
                    Ok(()) => {}
@@ -1470,7 +1465,7 @@ impl RemoteTimelineClient {
                        retries,
                        DEFAULT_BASE_BACKOFF_SECONDS,
                        DEFAULT_MAX_BACKOFF_SECONDS,
-                        &cancel,
+                        &shutdown_token(),
                    )
                    .await;
                }
@@ -1949,7 +1944,6 @@ mod tests {
            tracing::info_span!(
                "test",
                tenant_id = %self.harness.tenant_shard_id.tenant_id,
-                shard_id = %self.harness.tenant_shard_id.shard_slug(),
                timeline_id = %TIMELINE_ID
            )
        }
@@ -1987,7 +1981,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2081,7 +2075,7 @@ mod tests {

        // Download back the index.json, and check that the list of files is correct
        let index_part = match client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .unwrap()
        {
@@ -2283,7 +2277,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file(&CancellationToken::new())
+            .download_index_file(CancellationToken::new())
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{
    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
 };
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
 use crate::TEMP_FILE_SUFFIX;
@@ -76,6 +76,7 @@ pub async fn download_layer_file<'a>(
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

+    let cancel_inner = cancel.clone();
    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            let destination_file = tokio::fs::File::create(&temp_file_path)
@@ -86,7 +87,7 @@ pub async fn download_layer_file<'a>(
            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
            // file: the write to local file doesn't start until after the request header is returned
            // and we start draining the body stream below
-            let download = download_cancellable(cancel, storage.download(&remote_path))
+            let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
                .await
                .with_context(|| {
                    format!(
@@ -106,7 +107,7 @@ pub async fn download_layer_file<'a>(
            // we will imminiently try and write to again.
            let bytes_amount: u64 = match timeout_cancellable(
                DOWNLOAD_TIMEOUT,
-                cancel,
+                &cancel_inner,
                tokio::io::copy_buf(&mut reader, &mut destination_file),
            )
            .await
@@ -216,15 +217,16 @@ pub async fn list_remote_timelines(
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

+    let cancel_inner = cancel.clone();
    let listing = download_retry_forever(
        || {
            download_cancellable(
-                &cancel,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
+                &cancel_inner,
+                storage.list(Some(&remote_path), ListingMode::WithDelimiter),
            )
        },
        &format!("list timelines for {tenant_shard_id}"),
-        &cancel,
+        cancel,
    )
    .await?;

@@ -257,18 +259,19 @@ async fn do_download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    use futures::stream::StreamExt;

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

+    let cancel_inner = cancel.clone();
    let index_part_bytes = download_retry_forever(
        || async {
            // Cancellation: if is safe to cancel this future because we're just downloading into
            // a memory buffer, not touching local disk.
            let index_part_download =
-                download_cancellable(cancel, storage.download(&remote_path)).await?;
+                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;

            let mut index_part_bytes = Vec::new();
            let mut stream = std::pin::pin!(index_part_download.download_stream);
@@ -286,7 +289,7 @@ async fn do_download_index_part(
    .await?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
-        .with_context(|| format!("deserialize index part file at {remote_path:?}"))
+        .with_context(|| format!("download index part file at {remote_path:?}"))
        .map_err(DownloadError::Other)?;

    Ok(index_part)
@@ -303,7 +306,7 @@ pub(super) async fn download_index_part(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -323,8 +326,14 @@ pub(super) async fn download_index_part(
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res =
-        do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
+    let res = do_download_index_part(
+        storage,
+        tenant_shard_id,
+        timeline_id,
+        my_generation,
+        cancel.clone(),
+    )
+    .await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -349,7 +358,7 @@ pub(super) async fn download_index_part(
        tenant_shard_id,
        timeline_id,
        my_generation.previous(),
-        cancel,
+        cancel.clone(),
    )
    .await;
    match res {
@@ -371,13 +380,16 @@ pub(super) async fn download_index_part(
    // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
    // to constructing a full index path with no generation, because the generation is a suffix.
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
-
-    let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None).await },
-        "list index_part files",
-        cancel,
+    let indices = backoff::retry(
+        || async { storage.list_files(Some(&index_prefix)).await },
+        |_| false,
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "listing index_part files",
+        backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
    )
-    .await?;
+    .await
+    .map_err(DownloadError::Other)?;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -434,6 +446,8 @@ pub(crate) async fn download_initdb_tar_zst(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

+    let cancel_inner = cancel.clone();
+
    let file = download_retry(
        || async {
            let file = OpenOptions::new()
@@ -446,16 +460,18 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = match download_cancellable(cancel, storage.download(&remote_path)).await
+            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
+                .await
            {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
-                    download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
+                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
+                        .await?
                }
                Err(other) => Err(other)?,
            };
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
-            let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
+            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
            // not without thinking carefully about how to recover safely from cancelling a write to
@@ -494,12 +510,12 @@ pub(crate) async fn download_initdb_tar_zst(

 /// Helper function to handle retries for a download operation.
 ///
-/// Remote operations can fail due to rate limits (S3), spurious network
+/// Remote operations can fail due to rate limits (IAM, S3), spurious network
 /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-pub(super) async fn download_retry<T, O, F>(
+async fn download_retry<T, O, F>(
    op: O,
    description: &str,
    cancel: &CancellationToken,
@@ -510,21 +526,19 @@ where
 {
    backoff::retry(
        op,
-        DownloadError::is_permanent,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        description,
-        cancel,
+        backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
    )
    .await
-    .ok_or_else(|| DownloadError::Cancelled)
-    .and_then(|x| x)
 }

 async fn download_retry_forever<T, O, F>(
    op: O,
    description: &str,
-    cancel: &CancellationToken,
+    cancel: CancellationToken,
 ) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
@@ -532,13 +546,11 @@ where
 {
    backoff::retry(
        op,
-        DownloadError::is_permanent,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        FAILED_DOWNLOAD_WARN_THRESHOLD,
        u32::MAX,
        description,
-        cancel,
+        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
    )
    .await
-    .ok_or_else(|| DownloadError::Cancelled)
-    .and_then(|x| x)
 }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
 use tracing::info;

 /// Serializes and uploads the given index part data to the remote storage.
-pub(crate) async fn upload_index_part<'a>(
+pub(super) async fn upload_index_part<'a>(
    storage: &'a GenericRemoteStorage,
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
@@ -188,18 +188,16 @@ pub(crate) async fn time_travel_recover_tenant(
        backoff::retry(
            || async {
                storage
-                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
+                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
                    .await
            },
            |e| !matches!(e, TimeTravelError::Other(_)),
            warn_after,
            max_attempts,
            "time travel recovery of tenant prefix",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
        )
-        .await
-        .ok_or_else(|| TimeTravelError::Cancelled)
-        .and_then(|x| x)?;
+        .await?;
    }
    Ok(())
 }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -160,7 +160,7 @@ impl SecondaryTenant {
        &self.tenant_shard_id
    }

-    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
+    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
        self.detail.lock().unwrap().get_layers_for_eviction(self)
    }

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -146,15 +146,14 @@ impl SecondaryDetail {
        }
    }

-    /// Additionally returns the total number of layers, used for more stable relative access time
-    /// based eviction.
    pub(super) fn get_layers_for_eviction(
        &self,
        parent: &Arc<SecondaryTenant>,
-    ) -> (DiskUsageEvictionInfo, usize) {
-        let mut result = DiskUsageEvictionInfo::default();
-        let mut total_layers = 0;
-
+    ) -> DiskUsageEvictionInfo {
+        let mut result = DiskUsageEvictionInfo {
+            max_layer_size: None,
+            resident_layers: Vec::new(),
+        };
        for (timeline_id, timeline_detail) in &self.timelines {
            result
                .resident_layers
@@ -170,10 +169,6 @@ impl SecondaryDetail {
                        relative_last_activity: finite_f32::FiniteF32::ZERO,
                    }
                }));
-
-            // total might be missing currently downloading layers, but as a lower than actual
-            // value it is good enough approximation.
-            total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
        }
        result.max_layer_size = result
            .resident_layers
@@ -188,7 +183,7 @@ impl SecondaryDetail {
            result.resident_layers.len()
        );

-        (result, total_layers)
+        result
    }
 }

@@ -317,7 +312,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            .tenant_manager
            .get_secondary_tenant_shard(*tenant_shard_id);
        let Some(tenant) = tenant else {
-            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            {
+                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            }
        };

        Ok(PendingDownload {
@@ -392,9 +389,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            }

            CompleteDownload {
-                secondary_state,
-                completed_at: Instant::now(),
-            }
+                    secondary_state,
+                    completed_at: Instant::now(),
+                }
        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }
 }
@@ -533,18 +530,18 @@ impl<'a> TenantDownloader<'a> {
                    .map_err(UpdateError::from)?;
                let mut heatmap_bytes = Vec::new();
                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
-                let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
+                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
                Ok(heatmap_bytes)
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "download heatmap",
-            &self.secondary_state.cancel,
+            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
+                UpdateError::Cancelled
+            }),
        )
-        .await
-        .ok_or_else(|| UpdateError::Cancelled)
-        .and_then(|x| x)?;
+        .await?;

        SECONDARY_MODE.download_heatmap.inc();

--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -371,6 +371,8 @@ async fn upload_tenant_heatmap(
    };
    let timelines = tenant.timelines.lock().unwrap().clone();

+    let tenant_cancel = tenant.cancel.clone();
+
    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
    // in remote storage.
@@ -399,7 +401,6 @@ async fn upload_tenant_heatmap(

    // Serialize the heatmap
    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
-    let bytes = bytes::Bytes::from(bytes);
    let size = bytes.len();

    // Drop out early if nothing changed since our last upload
@@ -410,12 +411,13 @@ async fn upload_tenant_heatmap(

    let path = remote_heatmap_path(tenant.get_tenant_shard_id());

-    let cancel = &tenant.cancel;
-
+    // Write the heatmap.
    tracing::debug!("Uploading {size} byte heatmap to {path}");
    if let Err(e) = backoff::retry(
        || async {
-            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
+            let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
+                bytes.clone(),
+            ))));
            remote_storage
                .upload_storage_object(bytes, size, &path)
                .await
@@ -424,13 +426,11 @@ async fn upload_tenant_heatmap(
        3,
        u32::MAX,
        "Uploading heatmap",
-        cancel,
+        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Shutting down"))
-    .and_then(|x| x)
    {
-        if cancel.is_cancelled() {
+        if tenant_cancel.is_cancelled() {
            return Err(UploadHeatmapError::Cancelled);
        } else {
            return Err(e.into());
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -0,0 +1,17 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+
+#[cfg(not(debug_assertions))]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
+    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));
+
+#[cfg(debug_assertions)]
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
+        panic!("missing extractors: {missing:?}")
+    }
+}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -609,19 +609,7 @@ impl DeltaLayerWriter {
        key_end: Key,
        timeline: &Arc<Timeline>,
    ) -> anyhow::Result<ResidentLayer> {
-        let inner = self.inner.take().unwrap();
-        let temp_path = inner.path.clone();
-        let result = inner.finish(key_end, timeline).await;
-        // The delta layer files can sometimes be really large. Clean them up.
-        if result.is_err() {
-            tracing::warn!(
-                "Cleaning up temporary delta file {temp_path} after error during writing"
-            );
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}")
-            }
-        }
-        result
+        self.inner.take().unwrap().finish(key_end, timeline).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -6,7 +6,7 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::repository::{Key, Value};
+use crate::repository::{Key, Value, MAX_VALUE_SIZE};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -284,6 +284,15 @@ impl InMemoryLayer {
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

+        if let Value::Image(buf) = val {
+            if buf.len() > MAX_VALUE_SIZE {
+                tracing::warn!(
+                    "Can't put value of size {} above limit {MAX_VALUE_SIZE} for key {key}",
+                    buf.len()
+                );
+            }
+        }
+
        let off = {
            // Avoid doing allocations for "small" values.
            // In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -15,7 +15,6 @@ use utils::sync::heavier_once_cell;
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
@@ -837,8 +836,6 @@ impl LayerInner {
        timeline: Arc<Timeline>,
        permit: heavier_once_cell::InitPermit,
    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -199,9 +199,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // Perhaps we did no work and the walredo process has been idle for some time:
            // give it a chance to shut down to avoid leaving walredo process running indefinitely.
-            if let Some(walredo_mgr) = &tenant.walredo_mgr {
-                walredo_mgr.maybe_quiesce(period * 10);
-            }
+            tenant.walredo_mgr.maybe_quiesce(period * 10);

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -215,8 +215,8 @@ pub struct Timeline {
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,

-    // WAL redo manager. `None` only for broken tenants.
-    walredo_mgr: Option<Arc<super::WalRedoManager>>,
+    // WAL redo manager
+    walredo_mgr: Arc<super::WalRedoManager>,

    /// Remote storage client.
    /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1138,7 +1138,7 @@ impl Timeline {
    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
    /// the graceful [`Timeline::flush_and_shutdown`] function.
    pub(crate) async fn shutdown(&self) {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();

        // Signal any subscribers to our cancellation token to drop out
        tracing::debug!("Cancelling CancellationToken");
@@ -1427,7 +1427,7 @@ impl Timeline {
        tenant_shard_id: TenantShardId,
        generation: Generation,
        shard_identity: ShardIdentity,
-        walredo_mgr: Option<Arc<super::WalRedoManager>>,
+        walredo_mgr: Arc<super::WalRedoManager>,
        resources: TimelineResources,
        pg_version: u32,
        state: TimelineState,
@@ -1964,7 +1964,7 @@ impl Timeline {
                    .await;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)),
+            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
        );
    }

@@ -2151,7 +2151,7 @@ impl Timeline {
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // We should never be calculating logical sizes on shard !=0, because these shards do not have
        // accurate relation sizes, and they do not emit consumption metrics.
        debug_assert!(self.tenant_shard_id.is_zero());
@@ -2849,7 +2849,7 @@ impl Timeline {
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> Result<(), FlushLayerError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        span::debug_assert_current_span_has_tenant_and_timeline_id();
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -4457,9 +4457,6 @@ impl Timeline {

                let img = match self
                    .walredo_mgr
-                    .as_ref()
-                    .context("timeline has no walredo manager")
-                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                    .await
                    .context("reconstruct a page image")
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1 +1,20 @@
+#[cfg(debug_assertions)]
+use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};

+#[cfg(not(debug_assertions))]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
+#[track_caller]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
+        once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));
+    // Check the tenant-level and the timeline-level extractor together; panic if any is reported missing.
+    let fields: [&dyn Extractor; 2] = [
+        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ];
+    if let Err(missing) = check_fields_present!(fields) {
+        panic!("missing extractors: {missing:?}")
+    }
+}
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -343,23 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
                            modification.commit(&ctx).await?;
                            uncommitted_records = 0;
                            filtered_records = 0;
-
-                            //
-                            // We should check checkpoint distance after appending each ingest_batch_size bytes because otherwise
-                            // layer size can become much larger than `checkpoint_distance`.
-                            // It can append because wal-sender is sending WAL using 125kb chucks and some WAL records can cause writing large
-                            // amount of data to key-value storage. So performing this check only after processing
-                            // all WAL records in the chunk, can cause huge L0 layer files.
-                            //
-                            timeline
-                                .check_checkpoint_distance()
-                                .await
-                                .with_context(|| {
-                                    format!(
-                                        "Failed to check checkpoint distance for timeline {}",
-                                        timeline.timeline_id
-                                    )
-                                })?;
                        }
                    }

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -28,10 +28,9 @@ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;

-pub use pageserver_api::models::virtual_file as api;
-pub(crate) mod io_engine;
+mod io_engine;
 mod open_options;
-pub(crate) use io_engine::IoEngineKind;
+pub use io_engine::IoEngineKind;
 pub(crate) use open_options::*;

 ///
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -7,100 +7,67 @@
 //!
 //! Then use [`get`] and  [`super::OpenOptions`].

-pub(crate) use super::api::IoEngineKind;
-#[derive(Clone, Copy)]
-#[repr(u8)]
-pub(crate) enum IoEngine {
-    NotSet,
+#[derive(
+    Copy,
+    Clone,
+    PartialEq,
+    Eq,
+    Hash,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+    Debug,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum IoEngineKind {
    StdFs,
    #[cfg(target_os = "linux")]
    TokioEpollUring,
 }

-impl From<IoEngineKind> for IoEngine {
-    fn from(value: IoEngineKind) -> Self {
-        match value {
-            IoEngineKind::StdFs => IoEngine::StdFs,
-            #[cfg(target_os = "linux")]
-            IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring,
-        }
-    }
-}
-
-impl TryFrom<u8> for IoEngine {
-    type Error = u8;
-
-    fn try_from(value: u8) -> Result<Self, Self::Error> {
-        Ok(match value {
-            v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet,
-            v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs,
-            #[cfg(target_os = "linux")]
-            v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring,
-            x => return Err(x),
-        })
-    }
-}
-
-static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
-
-pub(crate) fn set(engine_kind: IoEngineKind) {
-    let engine: IoEngine = engine_kind.into();
-    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
-    #[cfg(not(test))]
-    {
-        let metric = &crate::metrics::virtual_file_io_engine::KIND;
-        metric.reset();
-        metric
-            .with_label_values(&[&format!("{engine_kind}")])
-            .set(1);
-    }
-}
+static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();

 #[cfg(not(test))]
-pub(super) fn init(engine_kind: IoEngineKind) {
-    set(engine_kind);
-}
-
-pub(super) fn get() -> IoEngine {
-    let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap();
-    if cfg!(test) {
-        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
-        match cur {
-            IoEngine::NotSet => {
-                let kind = match std::env::var(env_var_name) {
-                    Ok(v) => match v.parse::<IoEngineKind>() {
-                        Ok(engine_kind) => engine_kind,
-                        Err(e) => {
-                            panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
-                        }
-                    },
-                    Err(std::env::VarError::NotPresent) => {
-                        crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
-                            .parse()
-                            .unwrap()
-                    }
-                    Err(std::env::VarError::NotUnicode(_)) => {
-                        panic!("env var {env_var_name} is not unicode");
-                    }
-                };
-                self::set(kind);
-                self::get()
-            }
-            x => x,
-        }
-    } else {
-        cur
+pub(super) fn init(engine: IoEngineKind) {
+    if IO_ENGINE.set(engine).is_err() {
+        panic!("called twice");
    }
+    crate::metrics::virtual_file_io_engine::KIND
+        .with_label_values(&[&format!("{engine}")])
+        .set(1);
 }

-use std::{
-    os::unix::prelude::FileExt,
-    sync::atomic::{AtomicU8, Ordering},
-};
+/// Returns the process-wide io engine kind held in `IO_ENGINE`. Test builds lazily
+/// initialize it from an env var, falling back to `DEFAULT_VIRTUAL_FILE_IO_ENGINE`;
+/// non-test builds panic (via `unwrap`) if `IO_ENGINE` has not been set.
+pub(super) fn get() -> &'static IoEngineKind {
+    #[cfg(test)]
+    {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
+            Ok(v) => v.parse::<IoEngineKind>().unwrap_or_else(|e| {
+                panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+            }),
+            Err(std::env::VarError::NotPresent) => {
+                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                    .parse()
+                    .unwrap()
+            }
+            Err(std::env::VarError::NotUnicode(_)) => {
+                panic!("env var {env_var_name} is not unicode");
+            }
+        })
+    }
+    #[cfg(not(test))]
+    IO_ENGINE.get().unwrap()
+}
+
+use std::os::unix::prelude::FileExt;

 use super::FileGuard;

-impl IoEngine {
+impl IoEngineKind {
    pub(super) async fn read_at<B>(
        &self,
        file_guard: FileGuard,
@@ -111,8 +78,7 @@ impl IoEngine {
        B: tokio_epoll_uring::BoundedBufMut + Send,
    {
        match self {
-            IoEngine::NotSet => panic!("not initialized"),
-            IoEngine::StdFs => {
+            IoEngineKind::StdFs => {
                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
                let dst = unsafe {
                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
@@ -130,7 +96,7 @@ impl IoEngine {
                ((file_guard, buf), res)
            }
            #[cfg(target_os = "linux")]
-            IoEngine::TokioEpollUring => {
+            IoEngineKind::TokioEpollUring => {
                let system = tokio_epoll_uring::thread_local_system().await;
                let (resources, res) = system.read(file_guard, offset, buf).await;
                (
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -1,6 +1,6 @@
 //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];

-use super::io_engine::IoEngine;
+use super::IoEngineKind;
 use std::{os::fd::OwnedFd, path::Path};

 #[derive(Debug, Clone)]
@@ -13,10 +13,9 @@ pub enum OpenOptions {
 impl Default for OpenOptions {
    fn default() -> Self {
        match super::io_engine::get() {
-            IoEngine::NotSet => panic!("io engine not set"),
-            IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
            #[cfg(target_os = "linux")]
-            IoEngine::TokioEpollUring => {
+            IoEngineKind::TokioEpollUring => {
                Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
            }
        }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -17,30 +17,71 @@
 //! records. It achieves it by dropping privileges before replaying
 //! any WAL records, so that even if an attacker hijacks the Postgres
 //! process, he cannot escape out of it.
-
-/// Process lifecycle and abstracction for the IPC protocol.
-mod process;
-
-/// Code to apply [`NeonWalRecord`]s.
-mod apply_neon;
-
-use crate::config::PageServerConf;
-use crate::metrics::{
-    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
-    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
-};
-use crate::repository::Key;
-use crate::walrecord::NeonWalRecord;
+//!
 use anyhow::Context;
-use bytes::{Bytes, BytesMut};
-use pageserver_api::key::key_to_rel_block;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::{BufMut, Bytes, BytesMut};
+use nix::poll::*;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::TenantShardId;
-use std::sync::{Arc, RwLock};
+use serde::Serialize;
+use std::collections::VecDeque;
+use std::io;
+use std::io::prelude::*;
+use std::ops::{Deref, DerefMut};
+use std::os::unix::io::AsRawFd;
+use std::process::Stdio;
+use std::process::{Child, ChildStdin, ChildStdout, Command};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::lsn::Lsn;
+use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
+
+#[cfg(feature = "testing")]
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::config::PageServerConf;
+use crate::metrics::{
+    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
+    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
+    WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+};
+use crate::repository::Key;
+use crate::walrecord::NeonWalRecord;
+
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
+use pageserver_api::reltag::{RelTag, SlruKind};
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
+use postgres_ffi::v14::nonrelfile_utils::{
+    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
+    transaction_id_set_status,
+};
+use postgres_ffi::BLCKSZ;
+
+/// Uniquely identifies a page in the cluster: a `RelTag` plus a block number (`blknum`).
+///
+/// In Postgres, the `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+/// `rel` holds the `RelTag` and `blknum` holds the block number.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
+pub(crate) struct BufferTag {
+    pub rel: RelTag,
+    pub blknum: u32,
+}
+
+struct ProcessInput {
+    stdin: ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}

 ///
 /// This is the real implementation that uses a Postgres process to
@@ -53,7 +94,22 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
+    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
+}
+
+/// Tells whether `rec` can be served by the neon redo functions, or whether
+/// it has to be passed to the wal-redo postgres process instead.
+///
+/// # Returns
+/// `false` for `NeonWalRecord::Postgres` records, `true` for every other
+/// record variant.
+fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
+    // Currently there is no bespoke Rust code to replay any Postgres WAL
+    // records, so those are the only ones that cannot be handled in neon.
+    let is_postgres_record = matches!(rec, NeonWalRecord::Postgres { .. });
+
+    // Everything that is not a Postgres record is applied in neon.
+    !is_postgres_record
 }

 ///
@@ -83,10 +139,10 @@ impl PostgresRedoManager {

        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
        let mut img = base_img.map(|p| p.1);
-        let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
+        let mut batch_neon = can_apply_in_neon(&records[0].1);
        let mut batch_start = 0;
        for (i, record) in records.iter().enumerate().skip(1) {
-            let rec_neon = apply_neon::can_apply_in_neon(&record.1);
+            let rec_neon = can_apply_in_neon(&record.1);

            if rec_neon != batch_neon {
                let result = if batch_neon {
@@ -192,7 +248,7 @@ impl PostgresRedoManager {
        let mut n_attempts = 0u32;
        loop {
            // launch the WAL redo process on first use
-            let proc: Arc<process::WalRedoProcess> = {
+            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
                match &*proc_guard {
                    None => {
@@ -203,7 +259,7 @@ impl PostgresRedoManager {
                            None => {
                                let start = Instant::now();
                                let proc = Arc::new(
-                                    process::WalRedoProcess::launch(
+                                    WalRedoProcess::launch(
                                        self.conf,
                                        self.tenant_shard_id,
                                        pg_version,
@@ -231,8 +287,9 @@ impl PostgresRedoManager {
            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
            let result = proc
-                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
+                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
                .context("apply_wal_records");

            let duration = started_at.elapsed();
@@ -359,12 +416,732 @@ impl PostgresRedoManager {
        _record_lsn: Lsn,
        record: &NeonWalRecord,
    ) -> anyhow::Result<()> {
-        apply_neon::apply_in_neon(record, key, page)?;
+        match record {
+            NeonWalRecord::Postgres {
+                will_init: _,
+                rec: _,
+            } => {
+                anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
+            }
+            NeonWalRecord::ClearVisibilityMapFlags {
+                new_heap_blkno,
+                old_heap_blkno,
+                flags,
+            } => {
+                // sanity check that this is modifying the correct relation
+                let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
+                assert!(
+                    rel.forknum == VISIBILITYMAP_FORKNUM,
+                    "ClearVisibilityMapFlags record on unexpected rel {}",
+                    rel
+                );
+                if let Some(heap_blkno) = *new_heap_blkno {
+                    // Calculate the VM block and offset that corresponds to the heap block.
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    // Check that we're modifying the correct VM block.
+                    assert!(map_block == blknum);
+
+                    // equivalent to PageGetContents(page)
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
+
+                // Repeat for 'old_heap_blkno', if any
+                if let Some(heap_blkno) = *old_heap_blkno {
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    assert!(map_block == blknum);
+
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
+                }
+            }
+            // Non-relational WAL records are handled here, with custom code that has the
+            // same effects as the corresponding Postgres WAL redo function.
+            NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::Clog,
+                    "ClogSetCommitted record with unexpected key {}",
+                    key
+                );
+                for &xid in xids {
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                    // Check that we're modifying the correct CLOG block.
+                    assert!(
+                        segno == expected_segno,
+                        "ClogSetCommitted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "ClogSetCommitted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+
+                    transaction_id_set_status(
+                        xid,
+                        pg_constants::TRANSACTION_STATUS_COMMITTED,
+                        page,
+                    );
+                }
+
+                // Append the timestamp
+                if page.len() == BLCKSZ as usize + 8 {
+                    page.truncate(BLCKSZ as usize);
+                }
+                if page.len() == BLCKSZ as usize {
+                    page.extend_from_slice(&timestamp.to_be_bytes());
+                } else {
+                    warn!(
+                        "CLOG blk {} in seg {} has invalid size {}",
+                        blknum,
+                        segno,
+                        page.len()
+                    );
+                }
+            }
+            NeonWalRecord::ClogSetAborted { xids } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::Clog,
+                    "ClogSetAborted record with unexpected key {}",
+                    key
+                );
+                for &xid in xids {
+                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+
+                    // Check that we're modifying the correct CLOG block.
+                    assert!(
+                        segno == expected_segno,
+                        "ClogSetAborted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "ClogSetAborted record for XID {} with unexpected key {}",
+                        xid,
+                        key
+                    );
+
+                    transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
+                }
+            }
+            NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::MultiXactOffsets,
+                    "MultixactOffsetCreate record with unexpected key {}",
+                    key
+                );
+                // Compute the block and offset to modify.
+                // See RecordNewMultiXact in PostgreSQL sources.
+                let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+                let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
+                let offset = (entryno * 4) as usize;
+
+                // Check that we're modifying the correct multixact-offsets block.
+                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                assert!(
+                    segno == expected_segno,
+                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                    mid,
+                    key
+                );
+                assert!(
+                    blknum == expected_blknum,
+                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
+                    mid,
+                    key
+                );
+
+                LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
+            }
+            NeonWalRecord::MultixactMembersCreate { moff, members } => {
+                let (slru_kind, segno, blknum) =
+                    key_to_slru_block(key).context("invalid record")?;
+                assert_eq!(
+                    slru_kind,
+                    SlruKind::MultiXactMembers,
+                    "MultixactMembersCreate record with unexpected key {}",
+                    key
+                );
+                for (i, member) in members.iter().enumerate() {
+                    let offset = moff + i as u32;
+
+                    // Compute the block and offset to modify.
+                    // See RecordNewMultiXact in PostgreSQL sources.
+                    let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                    let memberoff = mx_offset_to_member_offset(offset);
+                    let flagsoff = mx_offset_to_flags_offset(offset);
+                    let bshift = mx_offset_to_flags_bitshift(offset);
+
+                    // Check that we're modifying the correct multixact-members block.
+                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    assert!(
+                        segno == expected_segno,
+                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                        moff,
+                        key
+                    );
+                    assert!(
+                        blknum == expected_blknum,
+                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
+                        moff,
+                        key
+                    );
+
+                    let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                    flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+                    flagsval |= member.status << bshift;
+                    LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
+                    LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
+                }
+            }
+        }

        Ok(())
    }
 }

+struct WalRedoProcess { // handle to one spawned `postgres --wal-redo` child and its pipes
+    #[allow(dead_code)] // within this impl, only the "testing" build of `record_and_log` reads it
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Always `Some` after construction; only becomes `None` inside `Drop`.
+    child: Option<NoLeakChild>,
+    stdout: Mutex<ProcessOutput>, // response side; locked separately from `stdin`
+    stdin: Mutex<ProcessInput>, // request side; held while a request is being written
+    /// Counter that disambiguates dumps of same-sized walredo inputs failing in the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+impl WalRedoProcess {
+    /// Spawn `postgres --wal-redo` for the given tenant shard with piped stdin/stdout/stderr.
+    /// stdin and stdout are switched to non-blocking mode; stderr is forwarded to the log by a
+    /// background tokio task. If a step after the spawn fails, the child is killed and waited for.
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        macro_rules! set_nonblock_or_log_err {
+            ($file:ident) => {{
+                let res = set_nonblock($file.as_raw_fd());
+                if let Err(e) = &res {
+                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+                }
+                res
+            }};
+        }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: Mutex::new(ProcessInput {
+                stdin,
+                n_requests: 0,
+            }),
+            stdout: Mutex::new(ProcessOutput {
+                stdout,
+                pending_responses: VecDeque::new(),
+                n_processed_responses: 0,
+            }),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    fn id(&self) -> u32 { // id of the child process, as reported by the wrapped `Child`
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop") // `child` is only `None` while dropping
+            .id()
+    }
+
+    /// Apply the given Postgres WAL `records` on top of `base_img` (if any) for page `tag` by
+    /// sending one batched request to the child process. Returns the new page image. Fails if
+    /// any record is not `NeonWalRecord::Postgres`, or if the exchange with the child fails.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    fn apply_wal_records(
+        &self,
+        tag: BufferTag,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let input = self.stdin.lock().unwrap();
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image of BLCKSZ bytes, followed
+        // by some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // not every failure is caused by this particular input, but failures are so rare
+            // in tests that we capture all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0( // writes one serialized request, then reads back its BLCKSZ-byte response
+        &self,
+        writebuf: &[u8],
+        input: MutexGuard<ProcessInput>,
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let mut proc = { input }; // TODO: remove this legacy rename; it only keeps the patch small.
+        let mut nwrite = 0usize;
+
+        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
+            let n = loop {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
+                    Err(nix::errno::Errno::EINTR) => continue,
+                    res => break res,
+                }
+            }?;
+
+            if n == 0 {
+                anyhow::bail!("WAL redo timed out");
+            }
+
+            // If 'stdin' is writeable or flagged with an error, attempt the write.
+            let in_revents = stdin_pollfds[0].revents().unwrap();
+            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
+                // We still have more data to write, but the process closed the pipe.
+                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+            }
+        }
+        let request_no = proc.n_requests;
+        proc.n_requests += 1;
+        drop(proc);
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be the first to get the output mutex.
+        // To address this we track the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. It then releases
+        // the input mutex, locks the output mutex, and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the pending-responses ring buffer and truncates all empty elements from the
+        // front, advancing the processed-responses count.
+
+        let mut output = self.stdout.lock().unwrap();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
+                // Wait until stdout is readable (or the timeout expires). The child's stderr
+                // is not handled here; a separate task spawned in `launch` forwards it to the log.
+                let n = loop {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
+                        Err(nix::errno::Errno::EINTR) => continue,
+                        res => break res,
+                    }
+                }?;
+
+                if n == 0 {
+                    anyhow::bail!("WAL redo timed out");
+                }
+
+                // If we have some data in stdout, read it to the result buffer.
+                let out_revents = stdout_pollfds[0].revents().unwrap();
+                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
+                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                }
+            }
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't simply pop_front() our own response, because another
+        // requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) { // dumps the failed request into the tenant directory
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // log at error level in both outcomes, so that this trips up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {} // no-op when the "testing" feature is disabled
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) { // kills the child and waits for it, synchronously, via `kill_and_wait`
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
+
+/// Wrapper around `std::process::Child` that kills and waits for the child: either explicitly
+/// via `kill_and_wait`, or from `Drop` on a background task (best effort if the runtime stops).
+struct NoLeakChild {
+    tenant_id: TenantShardId, // despite the name, this holds the full tenant shard id
+    child: Option<Child>, // `None` once `kill_and_wait` or `Drop` has taken the child
+}
+
+impl Deref for NoLeakChild { // read access to the wrapped `Child`
+    type Target = Child;
+
+    fn deref(&self) -> &Self::Target {
+        self.child.as_ref().expect("must not use from drop") // panics once the child was taken
+    }
+}
+
+impl DerefMut for NoLeakChild { // mutable access to the wrapped `Child`
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.child.as_mut().expect("must not use from drop") // panics once the child was taken
+    }
+}
+
+impl NoLeakChild {
+    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> { // spawn + wrap
+        let child = command.spawn()?;
+        Ok(NoLeakChild {
+            tenant_id,
+            child: Some(child),
+        })
+    }
+
+    fn kill_and_wait(mut self, cause: WalRedoKillCause) { // synchronous kill + wait; consumes self
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return,
+        };
+        Self::kill_and_wait_impl(child, cause);
+    }
+
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
+        let res = child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We take `child` by value, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in `child` is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match child.wait() {
+            Ok(exit_status) => {
+                info!(exit_status = %exit_status, "wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
+            }
+        }
+    }
+}
+
+impl Drop for NoLeakChild {
+    fn drop(&mut self) {
+        let child = match self.child.take() {
+            Some(child) => child,
+            None => return, // already handled by `kill_and_wait`
+        };
+        let tenant_shard_id = self.tenant_id;
+        // Offload the kill+wait of the child process into the background.
+        // If someone stops the runtime, we'll leak the child process.
+        // We can ignore that case because we only stop the runtime on pageserver exit.
+        tokio::runtime::Handle::current().spawn(async move {
+            tokio::task::spawn_blocking(move || {
+                // Intentionally don't inherit the tracing context from whoever is dropping us.
+                // This thread is going to outlive our dropper.
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
+                let _entered = span.enter();
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
+            })
+            .await
+        });
+    }
+}
+
+trait NoLeakChildCommandExt { // extension trait: spawn a command directly into a `NoLeakChild`
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+}
+
+impl NoLeakChildCommandExt for Command {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+        NoLeakChild::spawn(tenant_id, self) // thin forwarder so callers can chain it on a builder
+    }
+}
+
+// Functions for constructing messages to send to the postgres WAL redo
+// process. See pgxn/neon_walredo/walredoproc.c for
+// explanation of the protocol.
+
+fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) { // appends a 'B' message
+    let len = 4 + 1 + 4 * 4; // presumably 4-byte length field + serialized BufferTag — TODO confirm
+
+    buf.put_u8(b'B');
+    buf.put_u32(len as u32);
+
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
+fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) { // appends a 'P' message
+    assert!(base_img.len() == 8192); // NOTE(review): hard-coded; other code in this file uses BLCKSZ
+
+    let len = 4 + 1 + 4 * 4 + base_img.len(); // same header size as 'B', plus the page image
+
+    buf.put_u8(b'P');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+    buf.put(base_img);
+}
+
+fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) { // appends an 'A' message
+    let len = 4 + 8 + rec.len(); // u32 length field + u64 LSN + record bytes (tag byte not counted)
+
+    buf.put_u8(b'A');
+    buf.put_u32(len as u32);
+    buf.put_u64(endlsn.0);
+    buf.put(rec);
+}
+
+fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) { // appends a 'G' message
+    let len = 4 + 1 + 4 * 4; // presumably 4-byte length field + serialized BufferTag — TODO confirm
+
+    buf.put_u8(b'G');
+    buf.put_u32(len as u32);
+    tag.ser_into(buf)
+        .expect("serialize BufferTag should always succeed");
+}
+
 #[cfg(test)]
 mod tests {
    use super::PostgresRedoManager;
@@ -373,7 +1150,6 @@ mod tests {
    use bytes::Bytes;
    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
-    use tracing::Instrument;
    use utils::{id::TenantId, lsn::Lsn};

    #[tokio::test]
@@ -398,7 +1174,6 @@ mod tests {
                short_records(),
                14,
            )
-            .instrument(h.span())
            .await
            .unwrap();

@@ -426,7 +1201,6 @@ mod tests {
                short_records(),
                14,
            )
-            .instrument(h.span())
            .await
            .unwrap();

@@ -447,7 +1221,6 @@ mod tests {
                short_records(),
                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
            )
-            .instrument(h.span())
            .await
            .unwrap_err();
    }
@@ -476,7 +1249,6 @@ mod tests {
        // underscored because unused, except for removal at drop
        _repo_dir: camino_tempfile::Utf8TempDir,
        manager: PostgresRedoManager,
-        tenant_shard_id: TenantShardId,
    }

    impl RedoHarness {
@@ -493,11 +1265,7 @@ mod tests {
            Ok(RedoHarness {
                _repo_dir: repo_dir,
                manager,
-                tenant_shard_id,
            })
        }
-        fn span(&self) -> tracing::Span {
-            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
-        }
    }
 }
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,235 +0,0 @@
-use crate::walrecord::NeonWalRecord;
-use anyhow::Context;
-use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
-use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
-use pageserver_api::reltag::SlruKind;
-use postgres_ffi::pg_constants;
-use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
-use postgres_ffi::v14::nonrelfile_utils::{
-    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
-    transaction_id_set_status,
-};
-use postgres_ffi::BLCKSZ;
-use tracing::*;
-
-/// Can this request be served by neon redo functions
-/// or we need to pass it to wal-redo postgres process?
-pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
-    // Currently, we don't have bespoken Rust code to replay any
-    // Postgres WAL records. But everything else is handled in neon.
-    #[allow(clippy::match_like_matches_macro)]
-    match rec {
-        NeonWalRecord::Postgres {
-            will_init: _,
-            rec: _,
-        } => false,
-        _ => true,
-    }
-}
-
-pub(crate) fn apply_in_neon(
-    record: &NeonWalRecord,
-    key: Key,
-    page: &mut BytesMut,
-) -> Result<(), anyhow::Error> {
-    match record {
-        NeonWalRecord::Postgres {
-            will_init: _,
-            rec: _,
-        } => {
-            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
-        }
-        NeonWalRecord::ClearVisibilityMapFlags {
-            new_heap_blkno,
-            old_heap_blkno,
-            flags,
-        } => {
-            // sanity check that this is modifying the correct relation
-            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
-            assert!(
-                rel.forknum == VISIBILITYMAP_FORKNUM,
-                "ClearVisibilityMapFlags record on unexpected rel {}",
-                rel
-            );
-            if let Some(heap_blkno) = *new_heap_blkno {
-                // Calculate the VM block and offset that corresponds to the heap block.
-                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                // Check that we're modifying the correct VM block.
-                assert!(map_block == blknum);
-
-                // equivalent to PageGetContents(page)
-                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                map[map_byte as usize] &= !(flags << map_offset);
-            }
-
-            // Repeat for 'old_heap_blkno', if any
-            if let Some(heap_blkno) = *old_heap_blkno {
-                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
-                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
-                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
-
-                assert!(map_block == blknum);
-
-                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
-
-                map[map_byte as usize] &= !(flags << map_offset);
-            }
-        }
-        // Non-relational WAL records are handled here, with custom code that has the
-        // same effects as the corresponding Postgres WAL redo function.
-        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::Clog,
-                "ClogSetCommitted record with unexpected key {}",
-                key
-            );
-            for &xid in xids {
-                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                // Check that we're modifying the correct CLOG block.
-                assert!(
-                    segno == expected_segno,
-                    "ClogSetCommitted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "ClogSetCommitted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-
-                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
-            }
-
-            // Append the timestamp
-            if page.len() == BLCKSZ as usize + 8 {
-                page.truncate(BLCKSZ as usize);
-            }
-            if page.len() == BLCKSZ as usize {
-                page.extend_from_slice(&timestamp.to_be_bytes());
-            } else {
-                warn!(
-                    "CLOG blk {} in seg {} has invalid size {}",
-                    blknum,
-                    segno,
-                    page.len()
-                );
-            }
-        }
-        NeonWalRecord::ClogSetAborted { xids } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::Clog,
-                "ClogSetAborted record with unexpected key {}",
-                key
-            );
-            for &xid in xids {
-                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-                // Check that we're modifying the correct CLOG block.
-                assert!(
-                    segno == expected_segno,
-                    "ClogSetAborted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "ClogSetAborted record for XID {} with unexpected key {}",
-                    xid,
-                    key
-                );
-
-                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
-            }
-        }
-        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::MultiXactOffsets,
-                "MultixactOffsetCreate record with unexpected key {}",
-                key
-            );
-            // Compute the block and offset to modify.
-            // See RecordNewMultiXact in PostgreSQL sources.
-            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
-            let offset = (entryno * 4) as usize;
-
-            // Check that we're modifying the correct multixact-offsets block.
-            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-            assert!(
-                segno == expected_segno,
-                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                mid,
-                key
-            );
-            assert!(
-                blknum == expected_blknum,
-                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
-                mid,
-                key
-            );
-
-            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
-        }
-        NeonWalRecord::MultixactMembersCreate { moff, members } => {
-            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
-            assert_eq!(
-                slru_kind,
-                SlruKind::MultiXactMembers,
-                "MultixactMembersCreate record with unexpected key {}",
-                key
-            );
-            for (i, member) in members.iter().enumerate() {
-                let offset = moff + i as u32;
-
-                // Compute the block and offset to modify.
-                // See RecordNewMultiXact in PostgreSQL sources.
-                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                let memberoff = mx_offset_to_member_offset(offset);
-                let flagsoff = mx_offset_to_flags_offset(offset);
-                let bshift = mx_offset_to_flags_bitshift(offset);
-
-                // Check that we're modifying the correct multixact-members block.
-                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                assert!(
-                    segno == expected_segno,
-                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                    moff,
-                    key
-                );
-                assert!(
-                    blknum == expected_blknum,
-                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
-                    moff,
-                    key
-                );
-
-                let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
-                flagsval |= member.status << bshift;
-                LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
-                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
-            }
-        }
-    }
-    Ok(())
-}
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,408 +0,0 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-};
-use anyhow::Context;
-use bytes::Bytes;
-use nix::poll::{PollFd, PollFlags};
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-use std::os::fd::AsRawFd;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    io::{Read, Write},
-    process::{ChildStdin, ChildStdout, Command, Stdio},
-    sync::{Mutex, MutexGuard},
-    time::Duration,
-};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, nonblock::set_nonblock};
-
-mod no_leak_child;
-/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
-mod protocol;
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-        ($file:ident) => {{
-            let res = set_nonblock($file.as_raw_fd());
-            if let Err(e) = &res {
-                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-            }
-            res
-        }};
-    }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-        async move {
-            scopeguard::defer! {
-                debug!("wal-redo-postgres stderr_logger_task finished");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-            }
-            debug!("wal-redo-postgres stderr_logger_task started");
-            crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-            use tokio::io::AsyncBufReadExt;
-            let mut stderr_lines = tokio::io::BufReader::new(stderr);
-            let mut buf = Vec::new();
-            let res = loop {
-                buf.clear();
-                // TODO we don't trust the process to cap its stderr length.
-                // Currently it can do unbounded Vec allocation.
-                match stderr_lines.read_until(b'\n', &mut buf).await {
-                    Ok(0) => break Ok(()), // eof
-                    Ok(num_bytes) => {
-                        let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                        error!(%output, "received output");
-                    }
-                    Err(e) => {
-                        break Err(e);
-                    }
-                }
-            };
-            match res {
-                Ok(()) => (),
-                Err(e) => {
-                    error!(error=?e, "failed to read from walredo stderr");
-                }
-            }
-        }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-    );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) fn apply_wal_records(
-        &self,
-        rel: RelTag,
-        blknum: u32,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
-            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
--- a/pageserver/src/walredo/process/no_leak_child.rs
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -1,126 +0,0 @@
-use tracing;
-use tracing::error;
-use tracing::info;
-use tracing::instrument;
-
-use crate::metrics::WalRedoKillCause;
-use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
-
-use std::io;
-use std::process::Command;
-
-use std::ops::DerefMut;
-
-use std::ops::Deref;
-
-use std::process::Child;
-
-use pageserver_api::shard::TenantShardId;
-
-/// Wrapper type around `std::process::Child` which guarantees that the child
-/// will be killed and waited-for by this process before being dropped.
-pub(crate) struct NoLeakChild {
-    pub(crate) tenant_id: TenantShardId,
-    pub(crate) child: Option<Child>,
-}
-
-impl Deref for NoLeakChild {
-    type Target = Child;
-
-    fn deref(&self) -> &Self::Target {
-        self.child.as_ref().expect("must not use from drop")
-    }
-}
-
-impl DerefMut for NoLeakChild {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.child.as_mut().expect("must not use from drop")
-    }
-}
-
-impl NoLeakChild {
-    pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
-        let child = command.spawn()?;
-        Ok(NoLeakChild {
-            tenant_id,
-            child: Some(child),
-        })
-    }
-
-    pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        Self::kill_and_wait_impl(child, cause);
-    }
-
-    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
-    pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
-        scopeguard::defer! {
-            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
-        }
-        let res = child.kill();
-        if let Err(e) = res {
-            // This branch is very unlikely because:
-            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
-            // - This is the only place that calls .kill()
-            // - We consume `self`, so, .kill() can't be called twice.
-            // - If the process exited by itself or was killed by someone else,
-            //   .kill() will still succeed because we haven't wait()'ed yet.
-            //
-            // So, if we arrive here, we have really no idea what happened,
-            // whether the PID stored in self.child is still valid, etc.
-            // If this function were fallible, we'd return an error, but
-            // since it isn't, all we can do is log an error and proceed
-            // with the wait().
-            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
-        }
-
-        match child.wait() {
-            Ok(exit_status) => {
-                info!(exit_status = %exit_status, "wait successful");
-            }
-            Err(e) => {
-                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
-            }
-        }
-    }
-}
-
-impl Drop for NoLeakChild {
-    fn drop(&mut self) {
-        let child = match self.child.take() {
-            Some(child) => child,
-            None => return,
-        };
-        let tenant_shard_id = self.tenant_id;
-        // Offload the kill+wait of the child process into the background.
-        // If someone stops the runtime, we'll leak the child process.
-        // We can ignore that case because we only stop the runtime on pageserver exit.
-        tokio::runtime::Handle::current().spawn(async move {
-            tokio::task::spawn_blocking(move || {
-                // Intentionally don't inherit the tracing context from whoever is dropping us.
-                // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
-                let _entered = span.enter();
-                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
-            })
-            .await
-        });
-    }
-}
-
-pub(crate) trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
-}
-
-impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
-        NoLeakChild::spawn(tenant_id, self)
-    }
-}
--- a/pageserver/src/walredo/process/protocol.rs
+++ b/pageserver/src/walredo/process/protocol.rs
@@ -1,57 +0,0 @@
-use bytes::BufMut;
-use pageserver_api::reltag::RelTag;
-use serde::Serialize;
-use utils::bin_ser::BeSer;
-use utils::lsn::Lsn;
-
-///
-/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
-///
-/// In Postgres `BufferTag` structure is used for exactly the same purpose.
-/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
-///
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
-pub(crate) struct BufferTag {
-    pub rel: RelTag,
-    pub blknum: u32,
-}
-
-pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'B');
-    buf.put_u32(len as u32);
-
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
-
-pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
-    assert!(base_img.len() == 8192);
-
-    let len = 4 + 1 + 4 * 4 + base_img.len();
-
-    buf.put_u8(b'P');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-    buf.put(base_img);
-}
-
-pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
-    let len = 4 + 8 + rec.len();
-
-    buf.put_u8(b'A');
-    buf.put_u32(len as u32);
-    buf.put_u64(endlsn.0);
-    buf.put(rec);
-}
-
-pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
-    let len = 4 + 1 + 4 * 4;
-
-    buf.put_u8(b'G');
-    buf.put_u32(len as u32);
-    tag.ser_into(buf)
-        .expect("serialize BufferTag should always succeed");
-}
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -0,0 +1,78 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789b..ec54dea 100644
+--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
+ 	if (RelationNeedsWAL(index))
+	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
+#ifdef NEON_SMGR
+		{
+#if PG_VERSION_NUM >= 160000
+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+		}
+#endif
+	}
+
+#ifdef NEON_SMGR
+	smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
+ 	FreeBuildState(buildstate);
+ }
+ 
+-- 
+2.39.2
+
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -314,9 +314,6 @@ lfc_change_limit_hook(int newval, void *extra)
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
-	if (new_size == 0) {
-		lfc_ctl->generation += 1;
-	}
 	neon_log(DEBUG1, "set local file cache limit to %d", new_size);

 	LWLockRelease(lfc_lock);
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -11,23 +11,16 @@
 #include "postgres.h"
 #include "fmgr.h"

-#include "miscadmin.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "catalog/pg_type.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
 #include "replication/walsender.h"
-#include "storage/procsignal.h"
-#include "tcop/tcopprot.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
-#include "utils/wait_event.h"

 #include "neon.h"
 #include "walproposer.h"
@@ -37,130 +30,6 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);

-static int	logical_replication_max_time_lag = 3600;
-
-static void
-InitLogicalReplicationMonitor(void)
-{
-	BackgroundWorker bgw;
-
-	DefineCustomIntVariable(
-		"neon.logical_replication_max_time_lag",
-		"Threshold for dropping unused logical replication slots",
-		NULL,
-		&logical_replication_max_time_lag,
-		3600, 0, INT_MAX,
-		PGC_SIGHUP,
-		GUC_UNIT_S,
-		NULL, NULL, NULL);
-
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
-typedef struct
-{
-	NameData    name;
-	bool        dropped;
-	XLogRecPtr  confirmed_flush_lsn;
-	TimestampTz last_updated;
-} SlotStatus;
-
-/*
- * Unused logical replication slots pins WAL and prevents deletion of snapshots.
- */
-PGDLLEXPORT void
-LogicalSlotsMonitorMain(Datum main_arg)
-{
-	SlotStatus* slots;
-	TimestampTz now, last_checked;
-
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, die);
-
-	BackgroundWorkerUnblockSignals();
-
-	slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
-	last_checked = GetCurrentTimestamp();
-
-	for (;;)
-	{
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 logical_replication_max_time_lag*1000/2,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
-
-		now = GetCurrentTimestamp();
-
-		if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
-		{
-			int n_active_slots = 0;
-			last_checked = now;
-
-			LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
-			for (int i = 0; i < max_replication_slots; i++)
-			{
-				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
-
-				/* Consider only logical repliction slots */
-				if (!s->in_use || !SlotIsLogical(s))
-					continue;
-
-				if (s->active_pid != 0)
-				{
-					n_active_slots += 1;
-					continue;
-				}
-
-				/* Check if there was some activity with the slot since last check */
-				if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
-				{
-					slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
-					slots[i].last_updated = now;
-				}
-				else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
-				{
-					slots[i].name = s->data.name;
-					slots[i].dropped = true;
-				}
-			}
-			LWLockRelease(ReplicationSlotControlLock);
-
-			/*
-			 * If there are no active subscriptions, then no new snapshots are generated
-			 * and so no need to force slot deletion.
-			 */
-			if (n_active_slots != 0)
-			{
-				for (int i = 0; i < max_replication_slots; i++)
-				{
-					if (slots[i].dropped)
-					{
-						elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
-							 (now - slots[i].last_updated)/USECS_PER_SEC);
-						ReplicationSlotDrop(slots[i].name.data, true);
-						slots[i].dropped = false;
-					}
-				}
-			}
-		}
-	}
-}
-
 void
 _PG_init(void)
 {
@@ -175,8 +44,6 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();

-	InitLogicalReplicationMonitor();
-
 	InitControlPlaneConnector();

 	pg_init_extension_server();
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,6 +45,7 @@
 */
 #include "postgres.h"

+#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -2712,10 +2713,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;

 	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	mdcreate(reln, MAIN_FORKNUM, false);
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
 }

 /*
@@ -2739,7 +2744,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }

 /*
--- a/Show More
+++ b/Show More