tests: extend test_sharding for splitting

control_plane: support for shard splitting
pageserver: implement shard splitting
2026-07-04 12:40:37 +00:00 · 2024-01-03 15:51:05 +00:00 · 2024-01-03 15:51:05 +00:00 · 2024-01-03 15:51:05 +00:00 · 2024-01-03 15:51:05 +00:00 · 2024-01-03 15:43:56 +00:00
259 changed files with 6752 additions and 17989 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +1,2 @@
 [profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
+slow-timeout = "1m"
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -69,15 +69,7 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64

  kaniko-arm:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -93,15 +85,7 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
-        run: |
-          /kaniko/executor \
-            --reproducible \
-            --snapshotMode=redo \
-            --skip-unused-stages \
-            --dockerfile ${{ inputs.dockerfile-path }} \
-            --cache=true \
-            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

  manifest:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -115,10 +99,7 @@ jobs:

    steps:
      - name: Create manifest
-        run: |
-          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
-                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

      - name: Push manifest
        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,8 +21,6 @@ env:
  COPT: '-Werror'
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
-  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

 jobs:
  check-permissions:
@@ -46,20 +44,6 @@ jobs:

        exit 1

-  cancel-previous-e2e-tests:
-    needs: [ check-permissions ]
-    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Cancel previous e2e-tests runs for this PR
-        env:
-          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        run: |
-          gh workflow --repo neondatabase/cloud \
-            run cancel-previous-in-concurrency-group.yml \
-              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
-
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
@@ -121,11 +105,11 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run `ruff check` to ensure code format
-        run: poetry run ruff check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .

-      - name: Run `ruff format` to ensure code format
-        run: poetry run ruff format --check .
+      - name: Run black to ensure code format
+        run: poetry run black --diff --check .

      - name: Run mypy to check types
        run: poetry run mypy .
@@ -202,11 +186,7 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Raise locked memory limit for tokio-epoll-uring.
-      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
-      # io_uring will account the memory of the CQ and SQ as locked.
-      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      options: --init
    strategy:
      fail-fast: false
      matrix:
@@ -360,12 +340,8 @@ jobs:
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
        run: |
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -443,8 +419,8 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
    strategy:
      fail-fast: false
      matrix:
@@ -472,7 +448,6 @@ jobs:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -483,13 +458,12 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      # Default shared memory is 64mb
+      options: --init --shm-size=512mb
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
-        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
@@ -503,12 +477,11 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
+          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
-          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -722,8 +695,7 @@ jobs:
                \"commit_hash\": \"$COMMIT_SHA\",
                \"remote_repo\": \"${{ github.repository }}\",
                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
              }
            }"

@@ -1159,7 +1131,7 @@ jobs:
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -124,12 +124,12 @@ jobs:
      # Hence keeping target/ (and general cache size) smaller
      BUILD_TYPE: release
      CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --release
+      CARGO_FLAGS: --locked --release
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -210,20 +210,18 @@ jobs:

      - name: Run cargo build
        run: |
-          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      - name: Run cargo test
-        env:
-          NEXTEST_RETRIES: 3
        run: |
-          cargo nextest run $CARGO_FEATURES
+          cargo test $CARGO_FLAGS $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_s3
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -233,7 +231,7 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_azure
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

  check-codestyle-rust-arm:
    timeout-minutes: 90
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -20,51 +20,111 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

+env:
+  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
 permissions: {}

 jobs:
  tag-image:
    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye

    env:
-      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      FROM_TAG: ${{ inputs.from-tag }}
+      TO_TAG: ${{ inputs.to-tag }}
+    outputs:
+      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
+      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
+
+    steps:
+      - name: Install Crane & ECR helper
+        run: |
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Get source image digest
+        id: next-digest
+        run: |
+          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
+            exit 1
+          fi
+
+          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
+          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
+
+      - name: Get destination image digest (if already exists)
+        id: prev-digest
+        run: |
+          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
+          if [ -z "${PREV_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
+          else
+            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
+
+            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Tag image
+        run: |
+          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
+
+  rollback-tag-image:
+    needs:  tag-image
+    if: ${{ !success() }}
+
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+
+    env:
+      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
      FROM_TAG: ${{ inputs.from-tag }}
      TO_TAG: ${{ inputs.to-tag }}

    steps:
-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
+      - name: Install Crane & ECR helper
        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1

-      - uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v2
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install crane
+      - name: Configure ECR login
        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

-      - name: Copy images
+      - name: Restore previous tag if needed
        run: |
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
-          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
+          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
+          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"

-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
+          if [ -z "${NEXT_DIGEST}" ]; then
+            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
+            exit 0
+          fi
+
+          if [ -z "${PREV_DIGEST}" ]; then
+            # I guess we should delete the tag here/untag the image, but crane does not support it
+            # - https://github.com/google/go-containerregistry/issues/999
+
+            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
+
+            exit 0
+          fi
+
+          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
+          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
+            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
+
+            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
+          else
+            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
+          fi
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,26 +99,23 @@ libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
 native-tls = "0.2"
-nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
-notify = "6.0.0"
+nix = "0.26"
+notify = "5.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.20.0"
-opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.12.0"
+opentelemetry = "0.19.0"
+opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -151,7 +148,6 @@ test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
@@ -163,10 +159,10 @@ toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.20.0"
+tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
-uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
+uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
 x509-parser = "0.15"
@@ -220,10 +216,6 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

-# bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-
 ################# Binary contents sections

 [profile.release]
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.75.0
+ENV RUSTC_VERSION=1.74.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -52,7 +52,7 @@ RUN cd postgres && \
    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
    # In vanilla postgres this function is limited to Postgres role superuser.
    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
+    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
    # so we do it here.
    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
        fi; \
    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7,
+    # the second loop is for pg_stat_statement extension versions >= 1.7, 
    # where pg_stat_statement_reset() got 3 additional arguments
    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
        filename=$(basename "$file"); \
        if ! echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
        fi; \
-    done
+    done      

 #########################################################################################
 #
@@ -143,24 +143,29 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 #########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
-    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PLV8_VERSION=3.1.5 \
+        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
+        ;; \
+      "v16") \
+        export PLV8_VERSION=3.1.8 \
+        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
+        ;; \
+      *) \
+        echo "Export the valid PG_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
+    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
-    # generate and copy upgrade scripts
-    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
-    cp upgrade/* /usr/local/pgsql/share/extension/ && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
-    # don't break computes with installed old version of plv8
-    cd /usr/local/pgsql/lib/ && \
-    ln -s plv8-3.1.10.so plv8-3.1.5.so && \
-    ln -s plv8-3.1.10.so plv8-3.1.8.so && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -546,7 +551,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
-        -D RDK_INSTALL_COMIC_FONTS=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -613,7 +617,6 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
 FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
@@ -776,8 +779,6 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
-ARG PG_VERSION
-
 # Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /
@@ -882,10 +883,8 @@ FROM debian:bullseye-slim
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
-    mkdir /var/db/postgres/pgbouncer && \
    chown -R postgres:postgres /var/db/postgres && \
    chmod 0750 /var/db/postgres/compute && \
-    chmod 0750 /var/db/postgres/pgbouncer && \
    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
    # create folder for file cache
    mkdir -p -m 777 /neon/cache
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -32,6 +32,8 @@
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
+//!             --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \
+//!             --pgbouncer-ini-path /etc/pgbouncer.ini
 //! ```
 //!
 use std::collections::HashMap;
@@ -110,6 +112,9 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

+    let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
+    let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
+
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -220,13 +225,15 @@ fn main() -> Result<()> {
        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
+        pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
+        pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
    };
    let compute = Arc::new(compute_node);

    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps Postgres start quicker later,
+    // available for binding. Prewarming helps Postgres start quicker later,
    // because QEMU will already have it's memory allocated from the host, and
-    // the necessary binaries will already be cached.
+    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }
@@ -269,11 +276,6 @@ fn main() -> Result<()> {

    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
-
-    info!(
-        "running compute with features: {:?}",
-        state.pspec.as_ref().unwrap().spec.features
-    );
    drop(state);

    // Launch remaining service threads
@@ -286,7 +288,7 @@ fn main() -> Result<()> {
    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
-            error!("could not start the compute node: {:#}", err);
+            error!("could not start the compute node: {:?}", err);
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
@@ -348,7 +350,7 @@ fn main() -> Result<()> {

    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
-    if let Some((mut pg, logs_handle)) = pg {
+    if let Some(mut pg) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);

@@ -356,12 +358,6 @@ fn main() -> Result<()> {
            .wait()
            .expect("failed to start waiting on Postgres process");
        PG_PID.store(0, Ordering::SeqCst);
-
-        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
-
        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }
@@ -516,6 +512,23 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
+        .arg(
+            Arg::new("pgbouncer-connstr")
+                .long("pgbouncer-connstr")
+                .default_value(
+                    "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
+                )
+                .value_name("PGBOUNCER_CONNSTR"),
+        )
+        .arg(
+            Arg::new("pgbouncer-ini-path")
+                .long("pgbouncer-ini-path")
+                // Note: this doesn't match the current path of pgbouncer.ini.
+                // Until we fix it, the path needs to be passed explicitly,
+                // or this will effectively be a no-op.
+                .default_value("/etc/pgbouncer.ini")
+                .value_name("PGBOUNCER_INI_PATH"),
+        )
 }

 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,7 +20,7 @@ use futures::StreamExt;
 use postgres::{Client, NoTls};
 use tokio;
 use tokio_postgres;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -31,7 +31,6 @@ use utils::measured_stream::MeasuredReader;
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
-use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -71,6 +70,10 @@ pub struct ComputeNode {
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
+    // connection string to pgbouncer to change settings
+    pub pgbouncer_connstr: Option<String>,
+    // path to pgbouncer.ini to change settings
+    pub pgbouncer_ini_path: Option<String>,
 }

 // store some metrics about download size that might impact startup time
@@ -276,7 +279,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
            $$;"#,
        roles_decl, database_decl,
    );
-    info!("Neon superuser created: {}", inlinify(&query));
+    info!("Neon superuser created:\n{}", &query);
    client
        .simple_query(&query)
        .map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -492,7 +495,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let mut sync_handle = maybe_cgexec(&self.pgbin)
+        let sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -501,30 +504,18 @@ impl ComputeNode {
                vec![]
            })
            .stdout(Stdio::piped())
-            .stderr(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);

        // `postgres --sync-safekeepers` will print all log output to stderr and
-        // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
-        // will be collected in a child thread.
-        let stderr = sync_handle
-            .stderr
-            .take()
-            .expect("stderr should be captured");
-        let logs_handle = handle_postgres_logs(stderr);
-
+        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
+        // redirected to the caller output.
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);

-        // Process has exited, so we can join the logs thread.
-        let _ = logs_handle
-            .join()
-            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
-
        if !sync_output.status.success() {
            anyhow::bail!(
                "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}",
@@ -661,12 +652,11 @@ impl ComputeNode {

    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
-    /// Returns a handle to the child process and a handle to the logs thread.
    #[instrument(skip_all)]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    ) -> Result<std::process::Child> {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
@@ -677,18 +667,13 @@ impl ComputeNode {
            } else {
                vec![]
            })
-            .stderr(Stdio::piped())
            .spawn()
            .expect("cannot start postgres process");
        PG_PID.store(pg.id(), Ordering::SeqCst);

-        // Start a thread to collect logs from stderr.
-        let stderr = pg.stderr.take().expect("stderr should be captured");
-        let logs_handle = handle_postgres_logs(stderr);
-
        wait_for_postgres(&mut pg, pgdata_path)?;

-        Ok((pg, logs_handle))
+        Ok(pg)
    }

    /// Do initial configuration of the already started Postgres.
@@ -700,14 +685,13 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let connstr = self.connstr.clone();
-        let mut client = match Client::connect(connstr.as_str(), NoTls) {
+        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
            Err(e) => {
                info!(
                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                    e
                );
-                let mut zenith_admin_connstr = connstr.clone();
+                let mut zenith_admin_connstr = self.connstr.clone();

                zenith_admin_connstr
                    .set_username("zenith_admin")
@@ -720,8 +704,8 @@ impl ComputeNode {
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);

-                // reconnect with connstring with expected name
-                Client::connect(connstr.as_str(), NoTls)?
+                // reconnect with connstring with expected name
+                Client::connect(self.connstr.as_str(), NoTls)?
            }
            Ok(client) => client,
        };
@@ -735,8 +719,8 @@ impl ComputeNode {
        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, connstr.as_str())?;
+        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;
@@ -744,12 +728,6 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

-        if self.has_feature(ComputeFeature::Migrations) {
-            thread::spawn(move || {
-                let mut client = Client::connect(connstr.as_str(), NoTls)?;
-                handle_migrations(&mut client)
-            });
-        }
        Ok(())
    }

@@ -772,8 +750,8 @@ impl ComputeNode {
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

-        if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
-            info!("tuning pgbouncer");
+        if let Some(connstr) = &self.pgbouncer_connstr {
+            info!("tuning pgbouncer with connstr: {:?}", connstr);

            let rt = tokio::runtime::Builder::new_current_thread()
                .enable_all()
@@ -782,9 +760,15 @@ impl ComputeNode {

            // Spawn a thread to do the tuning,
            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pgbouncer_settings.clone();
+            let pgbouncer_settings = spec.pgbouncer_settings.clone();
+            let connstr_clone = connstr.clone();
+            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
+                let res = rt.block_on(tune_pgbouncer(
+                    pgbouncer_settings,
+                    &connstr_clone,
+                    pgbouncer_ini_path,
+                ));
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -814,10 +798,6 @@ impl ComputeNode {
            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
            handle_extension_neon(&mut client)?;
-            // We can skip handle_migrations here because a new migration can only appear
-            // if we have a new version of the compute_ctl binary, which can only happen
-            // if compute got restarted, in which case we'll end up inside of apply_config
-            // instead of reconfigure.
        }

        // 'Close' connection
@@ -838,10 +818,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(
-        &self,
-        extension_server_port: u16,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -853,8 +830,8 @@ impl ComputeNode {
        );

        // tune pgbouncer
-        if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
-            info!("tuning pgbouncer");
+        if let Some(connstr) = &self.pgbouncer_connstr {
+            info!("tuning pgbouncer with connstr: {:?}", connstr);

            let rt = tokio::runtime::Builder::new_current_thread()
                .enable_all()
@@ -863,9 +840,15 @@ impl ComputeNode {

            // Spawn a thread to do the tuning,
            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pgbouncer_settings.clone();
+            let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
+            let connstr_clone = connstr.clone();
+            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
+                let res = rt.block_on(tune_pgbouncer(
+                    pgbouncer_settings,
+                    &connstr_clone,
+                    pgbouncer_ini_path,
+                ));
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -906,7 +889,7 @@ impl ComputeNode {
        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
-        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
+        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
@@ -956,17 +939,7 @@ impl ComputeNode {
        };
        info!(?metrics, "compute start finished");

-        Ok(pg_process)
-    }
-
-    /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
-    pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
-        let mut state = self.state.lock().unwrap();
-        // NB: `Some(<DateTime>)` is always greater than `None`.
-        if last_active > state.last_active {
-            state.last_active = last_active;
-            debug!("set the last compute activity time to: {:?}", last_active);
-        }
+        Ok(pg)
    }

    // Look for core dumps and collect backtraces.
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {

    Ok(())
 }
-
-/// Replace all newline characters with a special character to make it
-/// easier to grep for log messages.
-pub fn inlinify(s: &str) -> String {
-    s.replace('\n', "\u{200B}")
-}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,165 +3,97 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, error, info, warn};
+use tracing::{debug, info};

 use crate::compute::ComputeNode;
-use compute_api::responses::ComputeStatus;
-use compute_api::spec::ComputeFeature;

 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
-// NB: the only expected panic is at `Mutex` unwrap(), all other errors
-// should be handled gracefully.
+// XXX: the only expected panic is at `Mutex` unwrap().
 fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
    let connstr = compute.connstr.as_str();
-
-    // During startup and configuration we connect to every Postgres database,
-    // but we don't want to count this as some user activity. So wait until
-    // the compute fully started before monitoring activity.
-    wait_for_postgres_start(compute);
-
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);

-    let mut sleep = false;
-    let mut prev_active_time: Option<f64> = None;
-    let mut prev_sessions: Option<i64> = None;
-
-    if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-        info!("starting experimental activity monitor for {}", connstr);
-    } else {
-        info!("starting activity monitor for {}", connstr);
-    }
+    info!("watching Postgres activity at {}", connstr);

    loop {
-        // We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
-        // But skip the first sleep, so we can connect to Postgres immediately.
-        if sleep {
-            // Should be outside of the mutex lock to allow others to read while we sleep.
-            thread::sleep(MONITOR_CHECK_INTERVAL);
-        } else {
-            sleep = true;
-        }
+        // Should be outside of the mutex lock to allow others to read while we sleep.
+        thread::sleep(MONITOR_CHECK_INTERVAL);

        match &mut client {
            Ok(cli) => {
                if cli.is_closed() {
-                    info!("connection to Postgres is closed, trying to reconnect");
+                    info!("connection to postgres closed, trying to reconnect");

                    // Connection is closed, reconnect and try again.
                    client = Client::connect(connstr, NoTls);
                    continue;
                }

-                // This is a new logic, only enable if the feature flag is set.
-                // TODO: remove this once we are sure that it works OR drop it altogether.
-                if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-                    // First, check if the total active time or sessions across all databases has changed.
-                    // If it did, it means that user executed some queries. In theory, it can even go down if
-                    // some databases were dropped, but it's still a user activity.
-                    match get_database_stats(cli) {
-                        Ok((active_time, sessions)) => {
-                            let mut detected_activity = false;
+                // Get all running client backends except ourself, use RFC3339 DateTime format.
+                let backends = cli
+                    .query(
+                        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
+                         FROM pg_stat_activity
+                         WHERE backend_type = 'client backend'
+                            AND pid != pg_backend_pid()
+                            AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
+                        &[],
+                    );
+                let mut last_active = compute.state.lock().unwrap().last_active;

-                            prev_active_time = match prev_active_time {
-                                Some(prev_active_time) => {
-                                    if active_time != prev_active_time {
-                                        detected_activity = true;
-                                    }
-                                    Some(active_time)
-                                }
-                                None => Some(active_time),
+                if let Ok(backs) = backends {
+                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
+
+                    for b in backs.into_iter() {
+                        let state: String = match b.try_get("state") {
+                            Ok(state) => state,
+                            Err(_) => continue,
+                        };
+
+                        if state == "idle" {
+                            let change: String = match b.try_get("state_change") {
+                                Ok(state_change) => state_change,
+                                Err(_) => continue,
                            };
-                            prev_sessions = match prev_sessions {
-                                Some(prev_sessions) => {
-                                    if sessions != prev_sessions {
-                                        detected_activity = true;
-                                    }
-                                    Some(sessions)
+                            let change = DateTime::parse_from_rfc3339(&change);
+                            match change {
+                                Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
+                                Err(e) => {
+                                    info!("cannot parse backend state_change DateTime: {}", e);
+                                    continue;
                                }
-                                None => Some(sessions),
-                            };
-
-                            if detected_activity {
-                                // Update the last active time and continue, we don't need to
-                                // check backends state change.
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
                            }
+                        } else {
+                            // Found non-idle backend, so the last activity is NOW.
+                            // Save it and exit the for loop. Also clear the idle backend
+                            // `state_change` timestamps array as it doesn't matter now.
+                            last_active = Some(Utc::now());
+                            idle_backs.clear();
+                            break;
                        }
-                        Err(e) => {
-                            error!("could not get database statistics: {}", e);
-                            continue;
-                        }
+                    }
+
+                    // Get idle backend `state_change` with the max timestamp.
+                    if let Some(last) = idle_backs.iter().max() {
+                        last_active = Some(*last);
                    }
                }

-                // Second, if database statistics is the same, check all backends state change,
-                // maybe there is some with more recent activity. `get_backends_state_change()`
-                // can return None or stale timestamp, so it's `compute.update_last_active()`
-                // responsibility to check if the new timestamp is more recent than the current one.
-                // This helps us to discover new sessions, that did nothing yet.
-                match get_backends_state_change(cli) {
-                    Ok(last_active) => {
-                        compute.update_last_active(last_active);
-                    }
-                    Err(e) => {
-                        error!("could not get backends state change: {}", e);
-                    }
-                }
-
-                // Finally, if there are existing (logical) walsenders, do not suspend.
-                //
-                // walproposer doesn't currently show up in pg_stat_replication,
-                // but protect if it will be
-                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-                match cli.query_one(ws_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_ws) => {
-                            if num_ws > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse walsenders count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of walsenders: {:?}", e);
-                        continue;
-                    }
-                }
-                //
-                // Do not suspend compute if autovacuum is running
-                //
-                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
-                match cli.query_one(autovacuum_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_workers) => {
-                            if num_workers > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse autovacuum workers count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of autovacuum workers: {:?}", e);
-                        continue;
-                    }
+                // Update the last activity in the shared state if we got a more recent one.
+                let mut state = compute.state.lock().unwrap();
+                // NB: `Some(<DateTime>)` is always greater than `None`.
+                if last_active > state.last_active {
+                    state.last_active = last_active;
+                    debug!("set the last compute activity time to: {:?}", last_active);
                }
            }
            Err(e) => {
-                debug!("could not connect to Postgres: {}, retrying", e);
+                debug!("cannot connect to postgres: {}, retrying", e);

                // Establish a new connection and try again.
                client = Client::connect(connstr, NoTls);
@@ -170,124 +102,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
    }
 }

-// Hang on condition variable waiting until the compute status is `Running`.
-fn wait_for_postgres_start(compute: &ComputeNode) {
-    let mut state = compute.state.lock().unwrap();
-    while state.status != ComputeStatus::Running {
-        info!("compute is not running, waiting before monitoring activity");
-        state = compute.state_changed.wait(state).unwrap();
-
-        if state.status == ComputeStatus::Running {
-            break;
-        }
-    }
-}
-
-// Figure out the total active time and sessions across all non-system databases.
-// Returned tuple is `(active_time, sessions)`.
-// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
-// it was a start with skipped `pg_catalog` updates and user didn't do any queries
-// (or open any sessions) yet.
-fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
-    // Filter out `postgres` database as `compute_ctl` and other monitoring tools
-    // like `postgres_exporter` use it to query Postgres statistics.
-    // Use explicit 8 bytes type casts to match Rust types.
-    let stats = cli.query_one(
-        "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
-            coalesce(sum(sessions), 0)::bigint AS total_sessions
-        FROM pg_stat_database
-        WHERE datname NOT IN (
-                'postgres',
-                'template0',
-                'template1'
-            );",
-        &[],
-    );
-    let stats = match stats {
-        Ok(stats) => stats,
-        Err(e) => {
-            return Err(anyhow::anyhow!("could not query active_time: {}", e));
-        }
-    };
-
-    let active_time: f64 = match stats.try_get("total_active_time") {
-        Ok(active_time) => active_time,
-        Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)),
-    };
-
-    let sessions: i64 = match stats.try_get("total_sessions") {
-        Ok(sessions) => sessions,
-        Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)),
-    };
-
-    Ok((active_time, sessions))
-}
-
-// Figure out the most recent state change time across all client backends.
-// If there is currently active backend, timestamp will be `Utc::now()`.
-// It can return `None`, which means no client backends exist or we were
-// unable to parse the timestamp.
-fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime<Utc>>> {
-    let mut last_active: Option<DateTime<Utc>> = None;
-    // Get all running client backends except ourself, use RFC3339 DateTime format.
-    let backends = cli.query(
-        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
-                FROM pg_stat_activity
-                    WHERE backend_type = 'client backend'
-                    AND pid != pg_backend_pid()
-                    AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
-        &[],
-    );
-
-    match backends {
-        Ok(backs) => {
-            let mut idle_backs: Vec<DateTime<Utc>> = vec![];
-
-            for b in backs.into_iter() {
-                let state: String = match b.try_get("state") {
-                    Ok(state) => state,
-                    Err(_) => continue,
-                };
-
-                if state == "idle" {
-                    let change: String = match b.try_get("state_change") {
-                        Ok(state_change) => state_change,
-                        Err(_) => continue,
-                    };
-                    let change = DateTime::parse_from_rfc3339(&change);
-                    match change {
-                        Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
-                        Err(e) => {
-                            info!("cannot parse backend state_change DateTime: {}", e);
-                            continue;
-                        }
-                    }
-                } else {
-                    // Found non-idle backend, so the last activity is NOW.
-                    // Return immediately, no need to check other backends.
-                    return Ok(Some(Utc::now()));
-                }
-            }
-
-            // Get idle backend `state_change` with the max timestamp.
-            if let Some(last) = idle_backs.iter().max() {
-                last_active = Some(*last);
-            }
-        }
-        Err(e) => {
-            return Err(anyhow::anyhow!("could not query backends: {}", e));
-        }
-    }
-
-    Ok(last_active)
-}
-
 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
-    let compute = Arc::clone(compute);
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+    let state = Arc::clone(state);

    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&compute))
+        .spawn(move || watch_compute_activity(&state))
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -6,15 +6,12 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
-use std::thread::JoinHandle;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tokio::io::AsyncBufReadExt;
-use tokio::time::timeout;
 use tokio_postgres::NoTls;
 use tracing::{debug, error, info, instrument};

@@ -366,7 +363,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
 }

 /// Update pgbouncer.ini with provided options
-fn update_pgbouncer_ini(
+pub fn update_pgbouncer_ini(
    pgbouncer_config: HashMap<String, String>,
    pgbouncer_ini_path: &str,
 ) -> Result<()> {
@@ -375,10 +372,6 @@ fn update_pgbouncer_ini(

    for (option_name, value) in pgbouncer_config.iter() {
        section.insert(option_name, value);
-        debug!(
-            "Updating pgbouncer.ini with new values {}={}",
-            option_name, value
-        );
    }

    conf.write_to_file(pgbouncer_ini_path)?;
@@ -388,147 +381,47 @@ fn update_pgbouncer_ini(
 /// Tune pgbouncer.
 /// 1. Apply new config using pgbouncer admin console
 /// 2. Add new values to pgbouncer.ini to preserve them after restart
-pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
-    let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
-        // for VMs use pgbouncer specific way to connect to
-        // pgbouncer admin console without password
-        // when pgbouncer is running under the same user.
-        "host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
-    } else {
-        // for k8s use normal connection string with password
-        // to connect to pgbouncer admin console
-        let mut pgbouncer_connstr =
-            "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
-        if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
-            pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
-        }
-        pgbouncer_connstr
-    };
-
-    info!(
-        "Connecting to pgbouncer with connection string: {}",
-        pgbouncer_connstr
-    );
-
-    // connect to pgbouncer, retrying several times
-    // because pgbouncer may not be ready yet
-    let mut retries = 3;
-    let client = loop {
-        match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
-            Ok((client, connection)) => {
-                tokio::spawn(async move {
-                    if let Err(e) = connection.await {
-                        eprintln!("connection error: {}", e);
-                    }
-                });
-                break client;
+pub async fn tune_pgbouncer(
+    pgbouncer_settings: Option<HashMap<String, String>>,
+    pgbouncer_connstr: &str,
+    pgbouncer_ini_path: Option<String>,
+) -> Result<()> {
+    if let Some(pgbouncer_config) = pgbouncer_settings {
+        // Apply new config
+        let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
+        let (client, connection) = connect_result.unwrap();
+        tokio::spawn(async move {
+            if let Err(e) = connection.await {
+                eprintln!("connection error: {}", e);
            }
-            Err(e) => {
-                if retries == 0 {
-                    return Err(e.into());
-                }
-                error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
-                retries -= 1;
-                tokio::time::sleep(Duration::from_secs(1)).await;
-            }
-        }
-    };
-
-    // Apply new config
-    for (option_name, value) in pgbouncer_config.iter() {
-        let query = format!("SET {}={}", option_name, value);
-        // keep this log line for debugging purposes
-        info!("Applying pgbouncer setting change: {}", query);
-
-        if let Err(err) = client.simple_query(&query).await {
-            // Don't fail on error, just print it into log
-            error!(
-                "Failed to apply pgbouncer setting change: {},  {}",
-                query, err
-            );
-        };
-    }
-
-    // save values to pgbouncer.ini
-    // so that they are preserved after pgbouncer restart
-    let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
-        // in VMs we use /etc/pgbouncer.ini
-        "/etc/pgbouncer.ini".to_string()
-    } else {
-        // in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
-        // this is a shared volume between pgbouncer and postgres containers
-        // FIXME: fix permissions for this file
-        "/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
-    };
-    update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
-
-    Ok(())
-}
-
-/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
-/// and send them to the logger. In the future we may also want to add context to
-/// these logs.
-pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
-    std::thread::spawn(move || {
-        let runtime = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .expect("failed to build tokio runtime");
-
-        let res = runtime.block_on(async move {
-            let stderr = tokio::process::ChildStderr::from_std(stderr)?;
-            handle_postgres_logs_async(stderr).await
        });
-        if let Err(e) = res {
-            tracing::error!("error while processing postgres logs: {}", e);
-        }
-    })
-}

-/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
-/// - next line starts with timestamp
-/// - EOF
-/// - no new lines were written for the last second
-async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
-    let mut lines = tokio::io::BufReader::new(stderr).lines();
-    let timeout_duration = Duration::from_millis(100);
-    let ts_regex =
-        regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
+        for (option_name, value) in pgbouncer_config.iter() {
+            info!(
+                "Applying pgbouncer setting change: {} = {}",
+                option_name, value
+            );
+            let query = format!("SET {} = {}", option_name, value);

-    let mut buf = vec![];
-    loop {
-        let next_line = timeout(timeout_duration, lines.next_line()).await;
+            let result = client.simple_query(&query).await;

-        // we should flush lines from the buffer if we cannot continue reading multiline message
-        let should_flush_buf = match next_line {
-            // Flushing if new line starts with timestamp
-            Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
-            // Flushing on EOF, timeout or error
-            _ => true,
-        };
+            info!("Applying pgbouncer setting change: {}", query);
+            info!("pgbouncer setting change result: {:?}", result);

-        if !buf.is_empty() && should_flush_buf {
-            // join multiline message into a single line, separated by unicode Zero Width Space.
-            // "PG:" suffix is used to distinguish postgres logs from other logs.
-            let combined = format!("PG:{}\n", buf.join("\u{200B}"));
-            buf.clear();
-
-            // sync write to stderr to avoid interleaving with other logs
-            use std::io::Write;
-            let res = std::io::stderr().lock().write_all(combined.as_bytes());
-            if let Err(e) = res {
-                tracing::error!("error while writing to stderr: {}", e);
-            }
-        }
-
-        // if not timeout, append line to the buffer
-        if next_line.is_ok() {
-            match next_line?? {
-                Some(line) => buf.push(line),
-                // EOF
-                None => break,
+            if let Err(err) = result {
+                // Don't fail on error, just print it into log
+                error!(
+                    "Failed to apply pgbouncer setting change: {},  {}",
+                    query, err
+                );
            };
        }
+
+        // save values to pgbouncer.ini
+        // so that they are preserved after pgbouncer restart
+        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
+            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
+        }
    }

    Ok(())
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -9,7 +9,6 @@ use reqwest::StatusCode;
 use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
-use crate::logger::inlinify;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -190,20 +189,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Print a list of existing Postgres roles (only in debug mode)
    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
+        info!("postgres roles:");
        for r in &existing_roles {
-            vec.push(format!(
-                "{}:{}",
+            info!(
+                "    - {}:{}",
                r.name,
                if r.encrypted_password.is_some() {
                    "[FILTERED]"
                } else {
                    "(null)"
                }
-            ));
+            );
        }
-
-        info!("postgres roles (total {}): {:?}", vec.len(), vec);
    }

    // Process delta operations first
@@ -241,10 +238,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Refresh Postgres roles info to handle possible roles renaming
    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;

-    info!(
-        "handling cluster spec roles (total {})",
-        spec.cluster.roles.len()
-    );
+    info!("cluster spec roles:");
    for role in &spec.cluster.roles {
        let name = &role.name;
        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
@@ -307,7 +301,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
-                info!("running role create query: '{}'", &query);
+                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
@@ -324,7 +318,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                RoleAction::Create => " -> create",
                RoleAction::Update => " -> update",
            };
-            info!(" - {}:{}{}", name, pwd, action_str);
+            info!("   - {}:{}{}", name, pwd, action_str);
        }
    }

@@ -433,11 +427,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
+        info!("postgres databases:");
        for (dbname, db) in &existing_dbs {
-            vec.push(format!("{}:{}", dbname, db.owner));
+            info!("    {}:{}", dbname, db.owner);
        }
-        info!("postgres databases (total {}): {:?}", vec.len(), vec);
    }

    // Process delta operations first
@@ -509,10 +502,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Refresh Postgres databases info to handle possible renames
    let existing_dbs = get_existing_dbs(client)?;

-    info!(
-        "handling cluster spec databases (total {})",
-        spec.cluster.databases.len()
-    );
+    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
        let pg_db = existing_dbs.get(name);
@@ -571,7 +561,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                DatabaseAction::Create => " -> create",
                DatabaseAction::Update => " -> update",
            };
-            info!(" - {}:{}{}", db.name, db.owner, action_str);
+            info!("   - {}:{}{}", db.name, db.owner, action_str);
        }
    }

@@ -672,11 +662,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
            $$;"
        .to_string();

-        info!(
-            "grant query for db {} : {}",
-            &db.name,
-            inlinify(&grant_query)
-        );
+        info!("grant query for db {} : {}", &db.name, &grant_query);
        db_client.simple_query(&grant_query)?;
    }

@@ -727,79 +713,3 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {

    Ok(())
 }
-
-#[instrument(skip_all)]
-pub fn handle_migrations(client: &mut Client) -> Result<()> {
-    info!("handle migrations");
-
-    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
-    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-    let migrations = [
-        "ALTER ROLE neon_superuser BYPASSRLS",
-        r#"
-DO $$
-DECLARE
-    role_name text;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
-    END LOOP;
-
-    FOR role_name IN SELECT rolname FROM pg_roles
-        WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
-    END LOOP;
-END $$;
-"#,
-    ];
-
-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-    client.simple_query(query)?;
-
-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-    client.simple_query(query)?;
-
-    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-    client.simple_query(query)?;
-
-    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-    client.simple_query(query)?;
-
-    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-    client.simple_query(query)?;
-
-    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client.query_one(query, &[])?;
-    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
-    let starting_migration_id = current_migration;
-
-    query = "BEGIN";
-    client.simple_query(query)?;
-
-    while current_migration < migrations.len() {
-        info!("Running migration:\n{}\n", migrations[current_migration]);
-        client.simple_query(migrations[current_migration])?;
-        current_migration += 1;
-    }
-    let setval = format!(
-        "UPDATE neon_migration.migration_id SET id={}",
-        migrations.len()
-    );
-    client.simple_query(&setval)?;
-
-    query = "COMMIT";
-    client.simple_query(query)?;
-
-    info!(
-        "Ran {} migrations",
-        (migrations.len() - starting_migration_id)
-    );
-    Ok(())
-}
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,8 +10,6 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
-diesel = { version = "2.1.4", features = ["postgres"]}
-diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
@@ -21,7 +19,6 @@ hex.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
-scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -6,14 +6,12 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-camino.workspace = true
 clap.workspace = true
 futures.workspace = true
-git-version.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
-pageserver_client.workspace = true
 postgres_connection.workspace = true
+reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
@@ -21,14 +19,7 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true

-# TODO: remove this after DB persistence is added, it is only used for
-# a parsing function when loading pageservers from neon_local LocalEnv
-postgres_backend.workspace = true
-
-diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
-
 utils = { path = "../../libs/utils/" }
-metrics = { path = "../../libs/metrics/" }
 control_plane = { path = ".." }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -1,6 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
-
-DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
-DROP FUNCTION IF EXISTS diesel_set_updated_at();
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -1,36 +0,0 @@
-- This file was automatically created by Diesel to setup helper functions
-- and other internal bookkeeping. This file is safe to edit, any future
-- changes will be added to existing projects as new migrations.
-
-
-
-
-- Sets up a trigger for the given table to automatically set a column called
-- `updated_at` whenever the row is modified (unless `updated_at` was included
-- in the modified columns)
--
-- # Example
--
-- ```sql
-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
--
-- SELECT diesel_manage_updated_at('users');
-- ```
-CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
-BEGIN
-    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
-                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
-END;
-$$ LANGUAGE plpgsql;
-
-CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
-BEGIN
-    IF (
-        NEW IS DISTINCT FROM OLD AND
-        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
-    ) THEN
-        NEW.updated_at := current_timestamp;
-    END IF;
-    RETURN NEW;
-END;
-$$ LANGUAGE plpgsql;
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
@@ -1 +0,0 @@
-DROP TABLE tenant_shards;
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -1,12 +0,0 @@
-CREATE TABLE tenant_shards (
-  tenant_id VARCHAR NOT NULL,
-  shard_number INTEGER NOT NULL,
-  shard_count INTEGER NOT NULL,
-  PRIMARY KEY(tenant_id, shard_number, shard_count),
-  shard_stripe_size INTEGER NOT NULL,
-  generation INTEGER NOT NULL,
-  generation_pageserver BIGINT NOT NULL,
-  placement_policy VARCHAR NOT NULL,
-  -- config is JSON encoded, opaque to the database.
-  config TEXT NOT NULL
-);
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
@@ -1 +0,0 @@
-DROP TABLE nodes;
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
@@ -1,10 +0,0 @@
-CREATE TABLE nodes (
-  node_id BIGINT PRIMARY KEY NOT NULL,
-
-  scheduling_policy VARCHAR NOT NULL,
-
-  listen_http_addr VARCHAR NOT NULL,
-  listen_http_port INTEGER NOT NULL,
-  listen_pg_addr VARCHAR NOT NULL,
-  listen_pg_port INTEGER NOT NULL
-);
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use postgres_connection::parse_host_port;
 use utils::id::{NodeId, TenantId};

@@ -25,17 +25,9 @@ impl ComputeHookTenant {
        self.shards
            .sort_by_key(|(shard, _node_id)| shard.shard_number);

-        if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
+        if self.shards.len() == shard_count.0 as usize {
            // We have pageservers for all the shards: proceed to reconfigure compute
-            let env = match LocalEnv::load_config() {
-                Ok(e) => e,
-                Err(e) => {
-                    tracing::warn!(
-                        "Couldn't load neon_local config, skipping compute update ({e})"
-                    );
-                    return Ok(());
-                }
-            };
+            let env = LocalEnv::load_config().expect("Error loading config");
            let cplane = ComputeControlPlane::load(env.clone())
                .expect("Error loading compute control plane");

--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,12 +1,11 @@
 use crate::reconciler::ReconcileError;
-use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
+use crate::service::Service;
+use hyper::StatusCode;
 use hyper::{Body, Request, Response};
-use hyper::{StatusCode, Uri};
-use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver_api::models::{TenantCreateRequest, TenantShardSplitRequest, TimelineCreateRequest};
 use pageserver_api::shard::TenantShardId;
 use std::sync::Arc;
-use utils::auth::SwappableJwtAuth;
-use utils::http::endpoint::{auth_middleware, request_span};
+use utils::http::endpoint::request_span;
 use utils::http::request::parse_request_param;
 use utils::id::TenantId;

@@ -31,21 +30,11 @@ use control_plane::attachment_service::{
 #[derive(Clone)]
 pub struct HttpState {
    service: Arc<crate::service::Service>,
-    auth: Option<Arc<SwappableJwtAuth>>,
-    allowlist_routes: Vec<Uri>,
 }

 impl HttpState {
-    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
-        let allowlist_routes = ["/status"]
-            .iter()
-            .map(|v| v.parse().unwrap())
-            .collect::<Vec<_>>();
-        Self {
-            service,
-            auth,
-            allowlist_routes,
-        }
+    pub fn new(service: Arc<crate::service::Service>) -> Self {
+        Self { service }
    }
 }

@@ -61,14 +50,7 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .re_attach(reattach_req)
-            .await
-            .map_err(ApiError::InternalServerError)?,
-    )
+    json_response(StatusCode::OK, state.service.re_attach(reattach_req))
 }

 /// Pageserver calls into this before doing deletions, to confirm that it still
@@ -86,14 +68,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
    let state = get_state(&req);

-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .attach_hook(attach_req)
-            .await
-            .map_err(ApiError::InternalServerError)?,
-    )
+    json_response(StatusCode::OK, state.service.attach_hook(attach_req))
 }

 async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -104,40 +79,40 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.inspect(inspect_req))
 }

-async fn handle_tenant_create(
-    service: Arc<Service>,
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
-}
-
-async fn handle_tenant_timeline_create(
-    service: Arc<Service>,
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
+    let state = get_state(&req);
    json_response(
        StatusCode::OK,
-        service
+        state.service.tenant_create(create_req).await?,
+    )
+}
+
+async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+    json_response(
+        StatusCode::OK,
+        state
+            .service
            .tenant_timeline_create(tenant_id, create_req)
            .await?,
    )
 }

-async fn handle_tenant_locate(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
 }

 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let register_req = json_request::<NodeRegisterRequest>(&mut req).await?;
    let state = get_state(&req);
-    state.service.node_register(register_req).await?;
+    state.service.node_register(register_req);
    json_response(StatusCode::OK, ())
 }

@@ -154,15 +129,28 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }

-async fn handle_tenant_shard_migrate(
-    service: Arc<Service>,
-    mut req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
-    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
+async fn handle_tenant_shard_split(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
+    let state = get_state(&req);
+
    json_response(
        StatusCode::OK,
-        service
+        state
+            .service
+            .tenant_shard_split(tenant_id, split_req)
+            .await?,
+    )
+}
+
+async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
+    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
+    let state = get_state(&req);
+    json_response(
+        StatusCode::OK,
+        state
+            .service
            .tenant_shard_migrate(tenant_shard_id, migrate_req)
            .await?,
    )
@@ -179,53 +167,9 @@ impl From<ReconcileError> for ApiError {
    }
 }

-/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
-/// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
-where
-    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
-    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
-{
-    let state = get_state(&request);
-    let service = state.service.clone();
-
-    let startup_complete = service.startup_complete.clone();
-    if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
-        .await
-        .is_err()
-    {
-        // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
-        // timeouts around its remote calls, to bound its runtime.
-        return Err(ApiError::Timeout(
-            "Timed out waiting for service readiness".into(),
-        ));
-    }
-
-    request_span(
-        request,
-        |request| async move { handler(service, request).await },
-    )
-    .await
-}
-
-pub fn make_router(
-    service: Arc<Service>,
-    auth: Option<Arc<SwappableJwtAuth>>,
-) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router();
-    if auth.is_some() {
-        router = router.middleware(auth_middleware(|request| {
-            let state = get_state(request);
-            if state.allowlist_routes.contains(request.uri()) {
-                None
-            } else {
-                state.auth.as_deref()
-            }
-        }))
-    }
-
-    router
-        .data(Arc::new(HttpState::new(service, auth)))
+pub fn make_router(service: Arc<Service>) -> RouterBuilder<hyper::Body, ApiError> {
+    endpoint::make_router()
+        .data(Arc::new(HttpState { service }))
        .get("/status", |r| request_span(r, handle_status))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
@@ -235,20 +179,17 @@ pub fn make_router(
        .put("/node/:node_id/config", |r| {
            request_span(r, handle_node_configure)
        })
-        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create)
-        })
-        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(r, handle_tenant_timeline_create)
+        .post("/tenant", |r| request_span(r, handle_tenant_create))
+        .post("/tenant/:tenant_id/timeline", |r| {
+            request_span(r, handle_tenant_timeline_create)
        })
        .get("/tenant/:tenant_id/locate", |r| {
-            tenant_service_handler(r, handle_tenant_locate)
+            request_span(r, handle_tenant_locate)
+        })
+        .put("/tenant/:tenant_id/shard_split", |r| {
+            request_span(r, handle_tenant_shard_split)
        })
        .put("/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(r, handle_tenant_shard_migrate)
+            request_span(r, handle_tenant_shard_migrate)
        })
-        // Path aliases for tests_forward_compatibility
-        // TODO: remove these in future PR
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -1,36 +1,25 @@
-use serde::{Deserialize, Serialize};
 use utils::seqwait::MonotonicCounter;

 mod compute_hook;
 pub mod http;
 mod node;
-pub mod persistence;
 mod reconciler;
 mod scheduler;
-mod schema;
 pub mod service;
 mod tenant_state;

-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone)]
 enum PlacementPolicy {
    /// Cheapest way to attach a tenant: just one pageserver, no secondary
    Single,
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
-    /// Do not attach to any pageservers
-    Detached,
 }

 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
 struct Sequence(u64);

-impl Sequence {
-    fn initial() -> Self {
-        Self(0)
-    }
-}
-
 impl std::fmt::Display for Sequence {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{}", self.0)
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -6,20 +6,13 @@
 ///
 use anyhow::anyhow;
 use attachment_service::http::make_router;
-use attachment_service::persistence::Persistence;
-use attachment_service::service::{Config, Service};
-use camino::Utf8PathBuf;
+use attachment_service::service::Service;
 use clap::Parser;
-use metrics::launch_timestamp::LaunchTimestamp;
-use std::sync::Arc;
-use tokio::signal::unix::SignalKind;
-use utils::auth::{JwtAuth, SwappableJwtAuth};
+use std::path::PathBuf;
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

-use utils::{project_build_tag, project_git_version, tcp_listener};
-
-project_git_version!(GIT_VERSION);
-project_build_tag!(BUILD_TAG);
+use utils::tcp_listener;

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -29,27 +22,13 @@ struct Cli {
    #[arg(short, long)]
    listen: std::net::SocketAddr,

-    /// Path to public key for JWT authentication of clients
-    #[arg(long)]
-    public_key: Option<camino::Utf8PathBuf>,
-
-    /// Token for authenticating this service with the pageservers it controls
-    #[arg(short, long)]
-    jwt_token: Option<String>,
-
    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
-    path: Utf8PathBuf,
-
-    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
-    #[arg(long)]
-    database_url: String,
+    path: PathBuf,
 }

 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
-
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
@@ -58,63 +37,29 @@ async fn main() -> anyhow::Result<()> {

    let args = Cli::parse();
    tracing::info!(
-        "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
-        GIT_VERSION,
-        launch_ts.to_string(),
-        BUILD_TAG,
-        args.path,
+        "Starting, state at {}, listening on {}",
+        args.path.to_string_lossy(),
        args.listen
    );

-    let config = Config {
-        jwt_token: args.jwt_token,
-    };
-
-    let json_path = if args.path.as_os_str().is_empty() {
-        None
-    } else {
-        Some(args.path)
-    };
-    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
-
-    let service = Service::spawn(config, persistence.clone()).await?;
+    let service = Service::spawn();

    let http_listener = tcp_listener::bind(args.listen)?;
-
-    let auth = if let Some(public_key_path) = &args.public_key {
-        let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
-        Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
-    } else {
-        None
-    };
-    let router = make_router(service, auth)
-        .build()
-        .map_err(|err| anyhow!(err))?;
-    let router_service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
+    let router = make_router(service).build().map_err(|err| anyhow!(err))?;
+    let service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

-    // Wait until we receive a signal
-    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
-    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
-    let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
-    tokio::select! {
-        _ = sigint.recv() => {},
-        _ = sigterm.recv() => {},
-        _ = sigquit.recv() => {},
-    }
-    tracing::info!("Terminating on signal");
-
-    if json_path.is_some() {
-        // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
-        // full postgres dumps around.
-        if let Err(e) = persistence.write_tenants_json().await {
-            tracing::error!("Failed to write JSON on shutdown: {e}")
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
        }
-    }
+    })?;

-    std::process::exit(0);
+    Ok(())
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,8 +1,6 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
 use utils::id::NodeId;

-use crate::persistence::NodePersistence;
-
 #[derive(Clone)]
 pub(crate) struct Node {
    pub(crate) id: NodeId,
@@ -19,7 +17,10 @@ pub(crate) struct Node {

 impl Node {
    pub(crate) fn base_url(&self) -> String {
-        format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
+        format!(
+            "http://{}:{}/v1",
+            self.listen_http_addr, self.listen_http_port
+        )
    }

    /// Is this node elegible to have work scheduled onto it?
@@ -36,15 +37,4 @@ impl Node {
            NodeSchedulingPolicy::Pause => false,
        }
    }
-
-    pub(crate) fn to_persistent(&self) -> NodePersistence {
-        NodePersistence {
-            node_id: self.id.0 as i64,
-            scheduling_policy: self.scheduling.into(),
-            listen_http_addr: self.listen_http_addr.clone(),
-            listen_http_port: self.listen_http_port as i32,
-            listen_pg_addr: self.listen_pg_addr.clone(),
-            listen_pg_port: self.listen_pg_port as i32,
-        }
-    }
 }
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,439 +0,0 @@
-use std::collections::HashMap;
-use std::str::FromStr;
-
-use camino::Utf8Path;
-use camino::Utf8PathBuf;
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
-use diesel::pg::PgConnection;
-use diesel::prelude::*;
-use diesel::Connection;
-use pageserver_api::models::TenantConfig;
-use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
-use postgres_connection::parse_host_port;
-use serde::{Deserialize, Serialize};
-use utils::generation::Generation;
-use utils::id::{NodeId, TenantId};
-
-use crate::node::Node;
-use crate::PlacementPolicy;
-
-/// ## What do we store?
-///
-/// The attachment service does not store most of its state durably.
-///
-/// The essential things to store durably are:
-/// - generation numbers, as these must always advance monotonically to ensure data safety.
-/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
-/// - Node's scheduling policies, as the source of truth for these is something external.
-///
-/// Other things we store durably as an implementation detail:
-/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
-///   but it is operationally simpler to make this service the authority for which nodes
-///   it talks to.
-///
-/// ## Performance/efficiency
-///
-/// The attachment service does not go via the database for most things: there are
-/// a couple of places where we must, and where efficiency matters:
-/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
-///   before it can attach a tenant, so this acts as a bound on how fast things like
-///   failover can happen.
-/// - Pageserver re-attach: we will increment many shards' generations when this happens,
-///   so it is important to avoid e.g. issuing O(N) queries.
-///
-/// Database calls relating to nodes have low performance requirements, as they are very rarely
-/// updated, and reads of nodes are always from memory, not the database.  We only require that
-/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
-pub struct Persistence {
-    database_url: String,
-
-    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
-    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
-    // compatible just yet.
-    json_path: Option<Utf8PathBuf>,
-}
-
-/// Legacy format, for use in JSON compat objects in test environment
-#[derive(Serialize, Deserialize)]
-struct JsonPersistence {
-    tenants: HashMap<TenantShardId, TenantShardPersistence>,
-}
-
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum DatabaseError {
-    #[error(transparent)]
-    Query(#[from] diesel::result::Error),
-    #[error(transparent)]
-    Connection(#[from] diesel::result::ConnectionError),
-    #[error("Logical error: {0}")]
-    Logical(String),
-}
-
-pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
-
-impl Persistence {
-    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
-        Self {
-            database_url,
-            json_path,
-        }
-    }
-
-    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
-    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
-    where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
-        R: Send + 'static,
-    {
-        let database_url = self.database_url.clone();
-        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
-            // TODO: connection pooling, such as via diesel::r2d2
-            let mut conn = PgConnection::establish(&database_url)?;
-            func(&mut conn)
-        })
-        .await
-        .expect("Task panic")
-    }
-
-    /// When a node is first registered, persist it before using it for anything
-    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
-        let np = node.to_persistent();
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::insert_into(crate::schema::nodes::table)
-                .values(&np)
-                .execute(conn)?;
-            Ok(())
-        })
-        .await
-    }
-
-    /// At startup, populate the list of nodes which our shards may be placed on
-    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
-        let nodes: Vec<Node> = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::nodes::table
-                    .load::<NodePersistence>(conn)?
-                    .into_iter()
-                    .map(|n| Node {
-                        id: NodeId(n.node_id as u64),
-                        // At startup we consider a node offline until proven otherwise.
-                        availability: NodeAvailability::Offline,
-                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
-                            .expect("Bad scheduling policy in DB"),
-                        listen_http_addr: n.listen_http_addr,
-                        listen_http_port: n.listen_http_port as u16,
-                        listen_pg_addr: n.listen_pg_addr,
-                        listen_pg_port: n.listen_pg_port as u16,
-                    })
-                    .collect::<Vec<Node>>())
-            })
-            .await?;
-
-        if nodes.is_empty() {
-            return self.list_nodes_local_env().await;
-        }
-
-        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
-
-        Ok(nodes)
-    }
-
-    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
-    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
-        // Enable test_backward_compatibility to work by populating our list of
-        // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
-        // first startup in the compat test, we may have shards but no nodes.
-        use control_plane::local_env::LocalEnv;
-        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
-        tracing::info!(
-            "Loading {} pageserver nodes from LocalEnv",
-            env.pageservers.len()
-        );
-        let mut nodes = Vec::new();
-        for ps_conf in env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            let node = Node {
-                id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-            };
-
-            // Synchronize database with what we learn from LocalEnv
-            self.insert_node(&node).await?;
-
-            nodes.push(node);
-        }
-
-        Ok(nodes)
-    }
-
-    /// At startup, load the high level state for shards, such as their config + policy.  This will
-    /// be enriched at runtime with state discovered on pageservers.
-    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
-        let loaded = self
-            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-            })
-            .await?;
-
-        if loaded.is_empty() {
-            if let Some(path) = &self.json_path {
-                if tokio::fs::try_exists(path)
-                    .await
-                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
-                {
-                    tracing::info!("Importing from legacy JSON format at {path}");
-                    return self.list_tenant_shards_json(path).await;
-                }
-            }
-        }
-        Ok(loaded)
-    }
-
-    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
-    pub(crate) async fn list_tenant_shards_json(
-        &self,
-        path: &Utf8Path,
-    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
-        let bytes = tokio::fs::read(path)
-            .await
-            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
-
-        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
-            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = tenant_id.to_string();
-                tenant.config = serde_json::to_string(&TenantConfig::default())
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
-                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-            }
-        }
-
-        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
-
-        // Synchronize database with what is in the JSON file
-        self.insert_tenant_shards(tenants.clone()).await?;
-
-        Ok(tenants)
-    }
-
-    /// For use in testing environments, where we dump out JSON on shutdown.
-    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
-        let Some(path) = &self.json_path else {
-            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
-        };
-        tracing::info!("Writing state to {path}...");
-        let tenants = self.list_tenant_shards().await?;
-        let mut tenants_map = HashMap::new();
-        for tsp in tenants {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-
-            tenants_map.insert(tenant_shard_id, tsp);
-        }
-        let json = serde_json::to_string(&JsonPersistence {
-            tenants: tenants_map,
-        })?;
-
-        tokio::fs::write(path, &json).await?;
-        tracing::info!("Wrote {} bytes to {path}...", json.len());
-
-        Ok(())
-    }
-
-    /// Tenants must be persisted before we schedule them for the first time.  This enables us
-    /// to correctly retain generation monotonicity, and the externally provided placement policy & config.
-    pub(crate) async fn insert_tenant_shards(
-        &self,
-        shards: Vec<TenantShardPersistence>,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> QueryResult<()> {
-                for tenant in &shards {
-                    diesel::insert_into(tenant_shards)
-                        .values(tenant)
-                        .execute(conn)?;
-                }
-                Ok(())
-            })?;
-            Ok(())
-        })
-        .await
-    }
-
-    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
-    /// the tenant from memory on this server.
-    #[allow(unused)]
-    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| -> DatabaseResult<()> {
-            diesel::delete(tenant_shards)
-                .filter(tenant_id.eq(del_tenant_id.to_string()))
-                .execute(conn)?;
-
-            Ok(())
-        })
-        .await
-    }
-
-    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
-    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
-    /// the node that called /re-attach.
-    #[tracing::instrument(skip_all, fields(node_id))]
-    pub(crate) async fn re_attach(
-        &self,
-        node_id: NodeId,
-    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
-        use crate::schema::tenant_shards::dsl::*;
-        let updated = self
-            .with_conn(move |conn| {
-                let rows_updated = diesel::update(tenant_shards)
-                    .filter(generation_pageserver.eq(node_id.0 as i64))
-                    .set(generation.eq(generation + 1))
-                    .execute(conn)?;
-
-                tracing::info!("Incremented {} tenants' generations", rows_updated);
-
-                // TODO: UPDATE+SELECT in one query
-
-                let updated = tenant_shards
-                    .filter(generation_pageserver.eq(node_id.0 as i64))
-                    .select(TenantShardPersistence::as_select())
-                    .load(conn)?;
-                Ok(updated)
-            })
-            .await?;
-
-        let mut result = HashMap::new();
-        for tsp in updated {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
-                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
-        }
-
-        Ok(result)
-    }
-
-    /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
-    /// advancing generation number.  We also store the NodeId for which the generation was issued, so that in
-    /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
-    pub(crate) async fn increment_generation(
-        &self,
-        tenant_shard_id: TenantShardId,
-        node_id: NodeId,
-    ) -> anyhow::Result<Generation> {
-        use crate::schema::tenant_shards::dsl::*;
-        let updated = self
-            .with_conn(move |conn| {
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
-                    .set((
-                        generation.eq(generation + 1),
-                        generation_pageserver.eq(node_id.0 as i64),
-                    ))
-                    // TODO: only returning() the generation column
-                    .returning(TenantShardPersistence::as_returning())
-                    .get_result(conn)?;
-
-                Ok(updated)
-            })
-            .await?;
-
-        Ok(Generation::new(updated.generation as u32))
-    }
-
-    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        use crate::schema::tenant_shards::dsl::*;
-        self.with_conn(move |conn| {
-            let updated = diesel::update(tenant_shards)
-                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
-                .set((
-                    generation_pageserver.eq(i64::MAX),
-                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
-                ))
-                .execute(conn)?;
-
-            Ok(updated)
-        })
-        .await?;
-
-        Ok(())
-    }
-
-    // TODO: when we start shard splitting, we must durably mark the tenant so that
-    // on restart, we know that we must go through recovery (list shards that exist
-    // and pick up where we left off and/or revert to parent shards).
-    #[allow(dead_code)]
-    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
-        todo!();
-    }
-
-    // TODO: when we finish shard splitting, we must atomically clean up the old shards
-    // and insert the new shards, and clear the splitting marker.
-    #[allow(dead_code)]
-    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
-        todo!();
-    }
-}
-
-/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
-#[diesel(table_name = crate::schema::tenant_shards)]
-pub(crate) struct TenantShardPersistence {
-    #[serde(default)]
-    pub(crate) tenant_id: String,
-    #[serde(default)]
-    pub(crate) shard_number: i32,
-    #[serde(default)]
-    pub(crate) shard_count: i32,
-    #[serde(default)]
-    pub(crate) shard_stripe_size: i32,
-
-    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    pub(crate) generation: i32,
-
-    // Currently attached pageserver
-    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: i64,
-
-    #[serde(default)]
-    pub(crate) placement_policy: String,
-    #[serde(default)]
-    pub(crate) config: String,
-}
-
-/// Parts of [`crate::node::Node`] that are stored durably
-#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
-#[diesel(table_name = crate::schema::nodes)]
-pub(crate) struct NodePersistence {
-    pub(crate) node_id: i64,
-    pub(crate) scheduling_policy: String,
-    pub(crate) listen_http_addr: String,
-    pub(crate) listen_http_port: i32,
-    pub(crate) listen_pg_addr: String,
-    pub(crate) listen_pg_port: i32,
-}
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,11 +1,13 @@
-use crate::persistence::Persistence;
-use crate::service;
 use control_plane::attachment_service::NodeAvailability;
+use control_plane::local_env::LocalEnv;
+use control_plane::pageserver::PageServerNode;
+use hyper::Method;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+    TenantLocationConfigRequest,
 };
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
-use pageserver_client::mgmt_api;
+use reqwest::Client;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;
@@ -30,8 +32,6 @@ pub(super) struct Reconciler {
    pub(crate) config: TenantConfig,
    pub(crate) observed: ObservedState,

-    pub(crate) service_config: service::Config,
-
    /// A snapshot of the pageservers as they were when we were asked
    /// to reconcile.
    pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
@@ -46,9 +46,6 @@ pub(super) struct Reconciler {
    /// example when a pageserver node goes offline, or the PlacementPolicy for
    /// the tenant is changed.
    pub(crate) cancel: CancellationToken,
-
-    /// Access to persistent storage for updating generation numbers
-    pub(crate) persistence: Arc<Persistence>,
 }

 #[derive(thiserror::Error, Debug)]
@@ -62,7 +59,6 @@ impl Reconciler {
        &mut self,
        node_id: NodeId,
        config: LocationConfig,
-        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
        let node = self
            .pageservers
@@ -73,18 +69,31 @@ impl Reconciler {
            .locations
            .insert(node.id, ObservedStateLocation { conf: None });

-        tracing::info!("location_config({}) calling: {:?}", node_id, config);
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
-        client
-            .location_config(self.tenant_shard_id, config.clone(), flush_ms)
+        let configure_request = TenantLocationConfigRequest {
+            tenant_id: self.tenant_shard_id,
+            config: config.clone(),
+        };
+
+        let client = Client::new();
+        let response = client
+            .request(
+                Method::PUT,
+                format!(
+                    "{}/tenant/{}/location_config",
+                    node.base_url(),
+                    self.tenant_shard_id
+                ),
+            )
+            .json(&configure_request)
+            .send()
            .await?;
-        tracing::info!("location_config({}) complete: {:?}", node_id, config);

        self.observed
            .locations
            .insert(node.id, ObservedStateLocation { conf: Some(config) });

+        response.error_for_status()?;
+
        Ok(())
    }

@@ -143,104 +152,35 @@ impl Reconciler {
        };

        // We have an origin and a destination: proceed to do the live migration
-        tracing::info!("Live migrating {}->{}", origin, destination);
-        self.live_migrate(origin, destination).await?;
+        let env = LocalEnv::load_config().expect("Error loading config");
+        let origin_ps = PageServerNode::from_env(
+            &env,
+            env.get_pageserver_conf(origin)
+                .expect("Conf missing pageserver"),
+        );
+        let destination_ps = PageServerNode::from_env(
+            &env,
+            env.get_pageserver_conf(destination)
+                .expect("Conf missing pageserver"),
+        );

-        Ok(())
-    }
-
-    async fn get_lsns(
-        &self,
-        tenant_shard_id: TenantShardId,
-        node_id: &NodeId,
-    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let node = self
-            .pageservers
-            .get(node_id)
-            .expect("Pageserver may not be removed while referenced");
-
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
-
-        let timelines = client.timeline_list(&tenant_shard_id).await?;
-        Ok(timelines
-            .into_iter()
-            .map(|t| (t.timeline_id, t.last_record_lsn))
-            .collect())
-    }
-
-    async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
-        let node = self
-            .pageservers
-            .get(node_id)
-            .expect("Pageserver may not be removed while referenced");
-
-        let client =
-            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
-
-        match client.tenant_secondary_download(tenant_shard_id).await {
-            Ok(()) => {}
-            Err(_) => {
-                tracing::info!("  (skipping, destination wasn't in secondary mode)")
-            }
-        }
-    }
-
-    async fn await_lsn(
-        &self,
-        tenant_shard_id: TenantShardId,
-        pageserver_id: &NodeId,
-        baseline: HashMap<TimelineId, Lsn>,
-    ) -> anyhow::Result<()> {
-        loop {
-            let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
-                Ok(l) => l,
-                Err(e) => {
-                    println!(
-                        "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
-                        pageserver_id
-                    );
-                    std::thread::sleep(Duration::from_millis(500));
-                    continue;
-                }
-            };
-
-            let mut any_behind: bool = false;
-            for (timeline_id, baseline_lsn) in &baseline {
-                match latest.get(timeline_id) {
-                    Some(latest_lsn) => {
-                        println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
-                        if latest_lsn < baseline_lsn {
-                            any_behind = true;
-                        }
-                    }
-                    None => {
-                        // Expected timeline isn't yet visible on migration destination.
-                        // (IRL we would have to account for timeline deletion, but this
-                        //  is just test helper)
-                        any_behind = true;
-                    }
-                }
-            }
-
-            if !any_behind {
-                println!("✅ LSN caught up.  Proceeding...");
-                break;
-            } else {
-                std::thread::sleep(Duration::from_millis(500));
-            }
-        }
+        tracing::info!(
+            "Live migrating {}->{}",
+            origin_ps.conf.id,
+            destination_ps.conf.id
+        );
+        self.live_migrate(origin_ps, destination_ps).await?;

        Ok(())
    }

    pub async fn live_migrate(
        &mut self,
-        origin_ps_id: NodeId,
-        dest_ps_id: NodeId,
+        origin_ps: PageServerNode,
+        dest_ps: PageServerNode,
    ) -> anyhow::Result<()> {
        // `maybe_live_migrate` is responsibble for sanity of inputs
-        assert!(origin_ps_id != dest_ps_id);
+        assert!(origin_ps.conf.id != dest_ps.conf.id);

        fn build_location_config(
            shard: &ShardIdentity,
@@ -260,9 +200,67 @@ impl Reconciler {
            }
        }

+        /// Fetch the timeline list for `tenant_shard_id` from `pageserver` and return a map
+        /// from each timeline's id to its `last_record_lsn`.
+        ///
+        /// Any error from `timeline_list` is propagated to the caller.
+        async fn get_lsns(
+            tenant_shard_id: TenantShardId,
+            pageserver: &PageServerNode,
+        ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+            let mut lsn_by_timeline = HashMap::new();
+            for info in pageserver.timeline_list(&tenant_shard_id).await? {
+                lsn_by_timeline.insert(info.timeline_id, info.last_record_lsn);
+            }
+            Ok(lsn_by_timeline)
+        }
+
+        /// Poll `pageserver` until every timeline in `baseline` is visible there with a
+        /// last-record LSN that is not behind its baseline value.
+        ///
+        /// Errors from `get_lsns` are treated as transient: they are printed and the poll is
+        /// retried after a short delay.  There is no overall timeout, so this only returns
+        /// once the destination has caught up.
+        async fn await_lsn(
+            tenant_shard_id: TenantShardId,
+            pageserver: &PageServerNode,
+            baseline: HashMap<TimelineId, Lsn>,
+        ) -> anyhow::Result<()> {
+            // Delay between successive polls of the destination pageserver.
+            const POLL_INTERVAL: Duration = Duration::from_millis(500);
+
+            loop {
+                let latest = match get_lsns(tenant_shard_id, pageserver).await {
+                    Ok(l) => l,
+                    Err(e) => {
+                        println!(
+                            "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                            pageserver.conf.id
+                        );
+                        // Use the async sleep: `std::thread::sleep` would block the executor
+                        // thread this future is running on for the whole interval.
+                        tokio::time::sleep(POLL_INTERVAL).await;
+                        continue;
+                    }
+                };
+
+                let mut any_behind: bool = false;
+                for (timeline_id, baseline_lsn) in &baseline {
+                    match latest.get(timeline_id) {
+                        Some(latest_lsn) => {
+                            println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                            if latest_lsn < baseline_lsn {
+                                any_behind = true;
+                            }
+                        }
+                        None => {
+                            // Expected timeline isn't yet visible on migration destination.
+                            // (IRL we would have to account for timeline deletion, but this
+                            //  is just test helper)
+                            any_behind = true;
+                        }
+                    }
+                }
+
+                if !any_behind {
+                    println!("✅ LSN caught up.  Proceeding...");
+                    break;
+                } else {
+                    // Not caught up yet: yield to the runtime until the next poll.
+                    tokio::time::sleep(POLL_INTERVAL).await;
+                }
+            }
+
+            Ok(())
+        }
+
        tracing::info!(
            "🔁 Switching origin pageserver {} to stale mode",
-            origin_ps_id
+            origin_ps.conf.id
        );

        // FIXME: it is incorrect to use self.generation here, we should use the generation
@@ -274,30 +272,18 @@ impl Reconciler {
            Some(self.generation),
            None,
        );
-        self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
+        origin_ps
+            .location_config(
+                self.tenant_shard_id,
+                stale_conf,
+                Some(Duration::from_secs(10)),
+            )
            .await?;

-        let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?);
-
-        // If we are migrating to a destination that has a secondary location, warm it up first
-        if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
-            if let Some(destination_conf) = &destination_conf.conf {
-                if destination_conf.mode == LocationConfigMode::Secondary {
-                    tracing::info!(
-                        "🔁 Downloading latest layers to destination pageserver {}",
-                        dest_ps_id,
-                    );
-                    self.secondary_download(self.tenant_shard_id, &dest_ps_id)
-                        .await;
-                }
-            }
-        }
+        let baseline_lsns = Some(get_lsns(self.tenant_shard_id, &origin_ps).await?);

        // Increment generation before attaching to new pageserver
-        self.generation = self
-            .persistence
-            .increment_generation(self.tenant_shard_id, dest_ps_id)
-            .await?;
+        self.generation = self.generation.next();

        let dest_conf = build_location_config(
            &self.shard,
@@ -307,18 +293,19 @@ impl Reconciler {
            None,
        );

-        tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
-        self.location_config(dest_ps_id, dest_conf, None).await?;
+        tracing::info!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+        dest_ps
+            .location_config(self.tenant_shard_id, dest_conf, None)
+            .await?;

        if let Some(baseline) = baseline_lsns {
            tracing::info!("🕑 Waiting for LSN to catch up...");
-            self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline)
-                .await?;
+            await_lsn(self.tenant_shard_id, &dest_ps, baseline).await?;
        }

-        tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
+        tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps.conf.id);
        self.compute_hook
-            .notify(self.tenant_shard_id, dest_ps_id)
+            .notify(self.tenant_shard_id, dest_ps.conf.id)
            .await?;

        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
@@ -330,13 +317,14 @@ impl Reconciler {
            None,
            Some(LocationConfigSecondary { warm: true }),
        );
-        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
+        origin_ps
+            .location_config(self.tenant_shard_id, origin_secondary_conf.clone(), None)
            .await?;
        // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
        // partway through.  In fact, all location conf API calls should be in a wrapper that sets
        // the observed state to None, then runs, then sets it to what we wrote.
        self.observed.locations.insert(
-            origin_ps_id,
+            origin_ps.conf.id,
            ObservedStateLocation {
                conf: Some(origin_secondary_conf),
            },
@@ -344,7 +332,7 @@ impl Reconciler {

        println!(
            "🔁 Switching to AttachedSingle mode on pageserver {}",
-            dest_ps_id
+            dest_ps.conf.id
        );
        let dest_final_conf = build_location_config(
            &self.shard,
@@ -353,10 +341,11 @@ impl Reconciler {
            Some(self.generation),
            None,
        );
-        self.location_config(dest_ps_id, dest_final_conf.clone(), None)
+        dest_ps
+            .location_config(self.tenant_shard_id, dest_final_conf.clone(), None)
            .await?;
        self.observed.locations.insert(
-            dest_ps_id,
+            dest_ps.conf.id,
            ObservedStateLocation {
                conf: Some(dest_final_conf),
            },
@@ -389,17 +378,12 @@ impl Reconciler {
                    // Nothing to do
                    tracing::info!("Observed configuration already correct.")
                }
-                _ => {
-                    // In all cases other than a matching observed configuration, we will
-                    // reconcile this location.  This includes locations with different configurations, as well
-                    // as locations with unknown (None) observed state.
-                    self.generation = self
-                        .persistence
-                        .increment_generation(self.tenant_shard_id, node_id)
-                        .await?;
+                Some(_) | None => {
+                    // If there is no observed configuration, or if its value does not equal our intent, then we must call out to the pageserver.
+                    self.generation = self.generation.next();
                    wanted_conf.generation = self.generation.into();
                    tracing::info!("Observed configuration requires update.");
-                    self.location_config(node_id, wanted_conf, None).await?;
+                    self.location_config(node_id, wanted_conf).await?;
                    if let Err(e) = self
                        .compute_hook
                        .notify(self.tenant_shard_id, node_id)
@@ -423,9 +407,8 @@ impl Reconciler {
                    // Nothing to do
                    tracing::info!(%node_id, "Observed configuration already correct.")
                }
-                _ => {
-                    // In all cases other than a matching observed configuration, we will
-                    // reconcile this location.
+                Some(_) | None => {
+                    // If there is no observed configuration, or if its value does not equal our intent, then we must call out to the pageserver.
                    tracing::info!(%node_id, "Observed configuration requires update.");
                    changes.push((*node_id, wanted_conf))
                }
@@ -456,7 +439,7 @@ impl Reconciler {
        }

        for (node_id, conf) in changes {
-            self.location_config(node_id, conf, None).await?;
+            self.location_config(node_id, conf).await?;
        }

        Ok(())
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -1,27 +0,0 @@
-// @generated automatically by Diesel CLI.
-
-diesel::table! {
-    nodes (node_id) {
-        node_id -> Int8,
-        scheduling_policy -> Varchar,
-        listen_http_addr -> Varchar,
-        listen_http_port -> Int4,
-        listen_pg_addr -> Varchar,
-        listen_pg_port -> Int4,
-    }
-}
-
-diesel::table! {
-    tenant_shards (tenant_id, shard_number, shard_count) {
-        tenant_id -> Varchar,
-        shard_number -> Int4,
-        shard_count -> Int4,
-        shard_stripe_size -> Int4,
-        generation -> Int4,
-        generation_pageserver -> Int8,
-        placement_policy -> Varchar,
-        config -> Text,
-    }
-}
-
-diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};

 use control_plane::attachment_service::NodeAvailability;
 use pageserver_api::{
-    models::{LocationConfig, LocationConfigMode, TenantConfig},
+    models::{LocationConfig, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
 };
 use tokio::task::JoinHandle;
@@ -16,10 +16,9 @@ use utils::{
 use crate::{
    compute_hook::ComputeHook,
    node::Node,
-    persistence::Persistence,
    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
    scheduler::{ScheduleError, Scheduler},
-    service, PlacementPolicy, Sequence,
+    PlacementPolicy, Sequence,
 };

 pub(crate) struct TenantState {
@@ -27,9 +26,6 @@ pub(crate) struct TenantState {

    pub(crate) shard: ShardIdentity,

-    // Runtime only: sequence used to coordinate when updating this object while
-    // with background reconcilers may be running.  A reconciler runs to a particular
-    // sequence.
    pub(crate) sequence: Sequence,

    // Latest generation number: next time we attach, increment this
@@ -49,8 +45,6 @@ pub(crate) struct TenantState {
    // with `Self::reconcile`.
    pub(crate) observed: ObservedState,

-    // Tenant configuration, passed through opaquely to the pageserver.  Identical
-    // for all shards in a tenant.
    pub(crate) config: TenantConfig,

    /// If a reconcile task is currently in flight, it may be joined here (it is
@@ -61,16 +55,6 @@ pub(crate) struct TenantState {
    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-
-    /// Indicates sequence number for which we have encountered an error reconciling.  If
-    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
-    /// and callers should stop waiting for `waiter` and propagate the error.
-    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-
-    /// The most recent error from a reconcile on this tenant
-    /// TODO: generalize to an array of recent events
-    /// TOOD: use a ArcSwap instead of mutex for faster reads?
-    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
 }

 #[derive(Default, Clone, Debug)]
@@ -105,41 +89,12 @@ pub(crate) struct ReconcilerWaiter {
    pub(crate) tenant_shard_id: TenantShardId,

    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-    error: std::sync::Arc<std::sync::Mutex<String>>,
    seq: Sequence,
 }

-#[derive(thiserror::Error, Debug)]
-pub enum ReconcileWaitError {
-    #[error("Timeout waiting for shard {0}")]
-    Timeout(TenantShardId),
-    #[error("shutting down")]
-    Shutdown,
-    #[error("Reconcile error on shard {0}: {1}")]
-    Failed(TenantShardId, String),
-}
-
 impl ReconcilerWaiter {
-    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
-        tokio::select! {
-            result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> {
-                result.map_err(|e| match e {
-                    SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id),
-                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown
-                })?;
-            },
-            result = self.error_seq_wait.wait_for(self.seq) => {
-                result.map_err(|e| match e {
-                    SeqWaitError::Shutdown => ReconcileWaitError::Shutdown,
-                    SeqWaitError::Timeout => unreachable!()
-                })?;
-
-                return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
-            }
-        }
-
-        Ok(())
+    /// Delegates to `seq_wait.wait_for_timeout` with this waiter's `seq` and the given
+    /// `timeout`, returning its `SeqWaitError` to the caller unchanged.
+    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), SeqWaitError> {
+        self.seq_wait.wait_for_timeout(self.seq, timeout).await
     }
 }

@@ -167,12 +122,6 @@ pub(crate) struct ReconcileResult {
 }

 impl IntentState {
-    pub(crate) fn new() -> Self {
-        Self {
-            attached: None,
-            secondary: vec![],
-        }
-    }
    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
        let mut result = Vec::new();
        if let Some(p) = self.attached {
@@ -184,6 +133,13 @@ impl IntentState {
        result
    }

+    /// Build an intent whose attached location is `node_id` (which may be `None`) and
+    /// which has no secondary locations.
+    pub(crate) fn single(node_id: Option<NodeId>) -> Self {
+        Self {
+            attached: node_id,
+            secondary: vec![],
+        }
+    }
+
    /// When a node goes offline, we update intents to avoid using it
    /// as their attached pageserver.
    ///
@@ -199,14 +155,6 @@ impl IntentState {
    }
 }

-impl ObservedState {
-    pub(crate) fn new() -> Self {
-        Self {
-            locations: HashMap::new(),
-        }
-    }
-}
-
 impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
@@ -224,53 +172,9 @@ impl TenantState {
            reconciler: None,
            sequence: Sequence(1),
            waiter: Arc::new(SeqWait::new(Sequence(0))),
-            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
-            last_error: Arc::default(),
        }
    }

-    /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my
-    /// [`ObservedState`], even if it violates my [`PlacementPolicy`].  Call [`Self::schedule`] next,
-    /// to get an intent state that complies with placement policy.  The overall goal is to do scheduling
-    /// in a way that makes use of any configured locations that already exist in the outside world.
-    pub(crate) fn intent_from_observed(&mut self) {
-        // Choose an attached location by filtering observed locations, and then sorting to get the highest
-        // generation
-        let mut attached_locs = self
-            .observed
-            .locations
-            .iter()
-            .filter_map(|(node_id, l)| {
-                if let Some(conf) = &l.conf {
-                    if conf.mode == LocationConfigMode::AttachedMulti
-                        || conf.mode == LocationConfigMode::AttachedSingle
-                        || conf.mode == LocationConfigMode::AttachedStale
-                    {
-                        Some((node_id, conf.generation))
-                    } else {
-                        None
-                    }
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-
-        attached_locs.sort_by_key(|i| i.1);
-        if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
-            self.intent.attached = Some(*node_id);
-        }
-
-        // All remaining observed locations generate secondary intents.  This includes None
-        // observations, as these may well have some local content on disk that is usable (this
-        // is an edge case that might occur if we restarted during a migration or other change)
-        self.observed.locations.keys().for_each(|node_id| {
-            if Some(*node_id) != self.intent.attached {
-                self.intent.secondary.push(*node_id);
-            }
-        });
-    }
-
    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
@@ -312,18 +216,6 @@ impl TenantState {
                    modified = true;
                }
            }
-            Detached => {
-                // Should have no attached or secondary pageservers
-                if self.intent.attached.is_some() {
-                    self.intent.attached = None;
-                    modified = true;
-                }
-
-                if !self.intent.secondary.is_empty() {
-                    self.intent.secondary.clear();
-                    modified = true;
-                }
-            }
        }

        if modified {
@@ -362,8 +254,6 @@ impl TenantState {
        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
        compute_hook: &Arc<ComputeHook>,
-        service_config: &service::Config,
-        persistence: &Arc<Persistence>,
    ) -> Option<ReconcilerWaiter> {
        // If there are any ambiguous observed states, and the nodes they refer to are available,
        // we should reconcile to clean them up.
@@ -391,8 +281,6 @@ impl TenantState {
                return Some(ReconcilerWaiter {
                    tenant_shard_id: self.tenant_shard_id,
                    seq_wait: self.waiter.clone(),
-                    error_seq_wait: self.error_waiter.clone(),
-                    error: self.last_error.clone(),
                    seq: self.sequence,
                });
            }
@@ -412,9 +300,7 @@ impl TenantState {
            observed: self.observed.clone(),
            pageservers: pageservers.clone(),
            compute_hook: compute_hook.clone(),
-            service_config: service_config.clone(),
            cancel: cancel.clone(),
-            persistence: persistence.clone(),
        };

        let reconcile_seq = self.sequence;
@@ -459,8 +345,6 @@ impl TenantState {
        Some(ReconcilerWaiter {
            tenant_shard_id: self.tenant_shard_id,
            seq_wait: self.waiter.clone(),
-            error_seq_wait: self.error_waiter.clone(),
-            error: self.last_error.clone(),
            seq: self.sequence,
        })
    }
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,41 +1,29 @@
 use crate::{background_process, local_env::LocalEnv};
-use camino::{Utf8Path, Utf8PathBuf};
-use diesel::{
-    backend::Backend,
-    query_builder::{AstPass, QueryFragment, QueryId},
-    Connection, PgConnection, QueryResult, RunQueryDsl,
-};
-use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-use hyper::Method;
+use anyhow::anyhow;
+use camino::Utf8PathBuf;
+use hyper::{Method, StatusCode};
 use pageserver_api::{
-    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
+    models::{
+        ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
+        TimelineCreateRequest, TimelineInfo,
+    },
    shard::TenantShardId,
 };
-use pageserver_client::mgmt_api::ResponseErrorMessageExt;
-use postgres_backend::AuthType;
+use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{env, str::FromStr};
-use tokio::process::Command;
+use std::{path::PathBuf, process::Child, str::FromStr};
 use tracing::instrument;
-use utils::{
-    auth::{Claims, Scope},
-    id::{NodeId, TenantId},
-};
+use utils::id::{NodeId, TenantId};

 pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
-    path: Utf8PathBuf,
-    jwt_token: Option<String>,
-    public_key_path: Option<Utf8PathBuf>,
-    postgres_port: u16,
+    path: PathBuf,
    client: reqwest::Client,
 }

 const COMMAND: &str = "attachment_service";

-const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -136,13 +124,18 @@ impl FromStr for NodeAvailability {
    }
 }

-/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
-/// type needs to be defined with diesel traits in there.
 #[derive(Serialize, Deserialize, Clone, Copy)]
 pub enum NodeSchedulingPolicy {
+    /// Normal, happy state
     Active,
+
+    /// A newly added node: gradually move some work here.
     Filling,
+
+    /// Do not schedule new work here, but leave configured locations in place.
     Pause,
+
+    /// Do not schedule work here.  Gracefully move work away, as resources allow.
     Draining,
 }

@@ -160,27 +153,12 @@ impl FromStr for NodeSchedulingPolicy {
    }
 }

-impl From<NodeSchedulingPolicy> for String {
-    fn from(value: NodeSchedulingPolicy) -> String {
-        use NodeSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Filling => "filling",
-            Pause => "pause",
-            Draining => "draining",
-        }
-        .to_string()
-    }
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
-        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
-            .unwrap()
-            .join("attachments.json");
+        let path = env.base_data_dir.join("attachments.json");

        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
@@ -192,42 +170,10 @@ impl AttachmentService {
            listen_url.port().unwrap()
        );

-        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
-        // port, for use by our captive postgres.
-        let postgres_port = listen_url
-            .port()
-            .expect("Control plane API setting should always have a port")
-            + 1;
-
-        // Assume all pageservers have symmetric auth configuration: this service
-        // expects to use one JWT token to talk to all of them.
-        let ps_conf = env
-            .pageservers
-            .first()
-            .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
-            AuthType::Trust => (None, None),
-            AuthType::NeonJWT => {
-                let jwt_token = env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
-
-                // If pageserver auth is enabled, this implicitly enables auth for this service,
-                // using the same credentials.
-                let public_key_path =
-                    camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
-                        .unwrap();
-                (Some(jwt_token), Some(public_key_path))
-            }
-        };
-
        Self {
            env: env.clone(),
            path,
            listen,
-            jwt_token,
-            public_key_path,
-            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
@@ -239,231 +185,15 @@ impl AttachmentService {
            .expect("non-Unicode path")
    }

-    /// PIDFile for the postgres instance used to store attachment service state
-    fn postgres_pid_file(&self) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.env
-                .base_data_dir
-                .join("attachment_service_postgres.pid"),
-        )
-        .expect("non-Unicode path")
-    }
+    pub async fn start(&self) -> anyhow::Result<Child> {
+        let path_str = self.path.to_string_lossy();

-    /// In order to access database migrations, we need to find the Neon source tree
-    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
-        // We assume that either prd or our binary is in the source tree. The former is usually
-        // true for automated test runners, the latter is usually true for developer workstations. Often
-        // both are true, which is fine.
-        let candidate_start_points = [
-            // Current working directory
-            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
-            // Directory containing the binary we're running inside
-            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
-        ];
-
-        // For each candidate start point, search through ancestors looking for a neon.git source tree root
-        for start_point in &candidate_start_points {
-            // Start from the build dir: assumes we are running out of a built neon source tree
-            for path in start_point.ancestors() {
-                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
-                // subdirectory.
-                let control_plane = path.join("control_plane");
-                if tokio::fs::try_exists(&control_plane).await? {
-                    return Ok(path.to_owned());
-                }
-            }
-        }
-
-        // Fall-through
-        Err(anyhow::anyhow!(
-            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
-        ))
-    }
-
-    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
-    ///
-    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
-    /// to other versions if that one isn't found.  Some automated tests create circumstances
-    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
-    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
-
-        for v in prefer_versions {
-            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
-            if tokio::fs::try_exists(&path).await? {
-                return Ok(path);
-            }
-        }
-
-        // Fall through
-        anyhow::bail!(
-            "Postgres binaries not found in {}",
-            self.env.pg_distrib_dir.display()
-        );
-    }
-
-    /// Readiness check for our postgres process
-    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
-        let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
-        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
-
-        Ok(exitcode.success())
-    }
-
-    /// Create our database if it doesn't exist, and run migrations.
-    ///
-    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
-    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
-    /// who just want to run `cargo neon_local` without knowing about diesel.
-    ///
-    /// Returns the database url
-    pub async fn setup_database(&self) -> anyhow::Result<String> {
-        let database_url = format!(
-            "postgresql://localhost:{}/attachment_service",
-            self.postgres_port
-        );
-        println!("Running attachment service database setup...");
-        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
-            let base = ::url::Url::parse(database_url).unwrap();
-            let database = base.path_segments().unwrap().last().unwrap().to_owned();
-            let mut new_url = base.join(default_database).unwrap();
-            new_url.set_query(base.query());
-            (database, new_url.into())
-        }
-
-        #[derive(Debug, Clone)]
-        pub struct CreateDatabaseStatement {
-            db_name: String,
-        }
-
-        impl CreateDatabaseStatement {
-            pub fn new(db_name: &str) -> Self {
-                CreateDatabaseStatement {
-                    db_name: db_name.to_owned(),
-                }
-            }
-        }
-
-        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
-            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
-                out.push_sql("CREATE DATABASE ");
-                out.push_identifier(&self.db_name)?;
-                Ok(())
-            }
-        }
-
-        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
-
-        impl QueryId for CreateDatabaseStatement {
-            type QueryId = ();
-
-            const HAS_STATIC_QUERY_ID: bool = false;
-        }
-        if PgConnection::establish(&database_url).is_err() {
-            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
-            println!("Creating database: {database}");
-            let mut conn = PgConnection::establish(&postgres_url)?;
-            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
-        }
-        let mut conn = PgConnection::establish(&database_url)?;
-
-        let migrations_dir = self
-            .find_source_root()
-            .await?
-            .join("control_plane/attachment_service/migrations");
-
-        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
-        println!("Running migrations in {}", migrations.path().display());
-        HarnessWithOutput::write_to_stdout(&mut conn)
-            .run_pending_migrations(migrations)
-            .map(|_| ())
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        println!("Migrations complete");
-
-        Ok(database_url)
-    }
-
-    pub async fn start(&self) -> anyhow::Result<()> {
-        // Start a vanilla Postgres process used by the attachment service for persistence.
-        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
-            .unwrap()
-            .join("attachment_service_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-        let pg_log_path = pg_data_path.join("postgres.log");
-
-        if !tokio::fs::try_exists(&pg_data_path).await? {
-            // Initialize empty database
-            let initdb_path = pg_bin_dir.join("initdb");
-            let mut child = Command::new(&initdb_path)
-                .args(["-D", pg_data_path.as_ref()])
-                .spawn()
-                .expect("Failed to spawn initdb");
-            let status = child.wait().await?;
-            if !status.success() {
-                anyhow::bail!("initdb failed with status {status}");
-            }
-
-            tokio::fs::write(
-                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}", self.postgres_port),
-            )
-            .await?;
-        };
-
-        println!("Starting attachment service database...");
-        let db_start_args = [
-            "-w",
-            "-D",
-            pg_data_path.as_ref(),
-            "-l",
-            pg_log_path.as_ref(),
-            "start",
-        ];
-
-        background_process::start_process(
-            "attachment_service_db",
-            &self.env.base_data_dir,
-            pg_bin_dir.join("pg_ctl").as_std_path(),
-            db_start_args,
-            [],
-            background_process::InitialPidFile::Create(self.postgres_pid_file()),
-            || self.pg_isready(&pg_bin_dir),
-        )
-        .await?;
-
-        // Run migrations on every startup, in case something changed.
-        let database_url = self.setup_database().await?;
-
-        let mut args = vec![
-            "-l",
-            &self.listen,
-            "-p",
-            self.path.as_ref(),
-            "--database-url",
-            &database_url,
-        ]
-        .into_iter()
-        .map(|s| s.to_string())
-        .collect::<Vec<_>>();
-        if let Some(jwt_token) = &self.jwt_token {
-            args.push(format!("--jwt-token={jwt_token}"));
-        }
-
-        if let Some(public_key_path) = &self.public_key_path {
-            args.push(format!("--public-key={public_key_path}"));
-        }
-
-        background_process::start_process(
+        let result = background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.attachment_service_bin(),
-            args,
-            [(
-                "NEON_REPO_DIR".to_string(),
-                self.env.base_data_dir.to_string_lossy().to_string(),
-            )],
+            ["-l", &self.listen, "-p", &path_str],
+            [],
            background_process::InitialPidFile::Create(self.pid_file()),
            || async {
                match self.status().await {
@@ -472,44 +202,28 @@ impl AttachmentService {
                }
            },
        )
-        .await?;
+        .await;

-        Ok(())
-    }
-
-    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
-
-        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-
-        println!("Stopping attachment service database...");
-        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
-        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_stop_args)
-            .spawn()?
-            .wait()
+        for ps_conf in &self.env.pageservers {
+            let (pg_host, pg_port) =
+                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            self.node_register(NodeRegisterRequest {
+                node_id: ps_conf.id,
+                listen_pg_addr: pg_host.to_string(),
+                listen_pg_port: pg_port.unwrap_or(5432),
+                listen_http_addr: http_host.to_string(),
+                listen_http_port: http_port.unwrap_or(80),
+            })
            .await?;
-        if !stop_status.success() {
-            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-                .args(pg_status_args)
-                .spawn()?
-                .wait()
-                .await?;
-
-            // pg_ctl status returns this exit code if postgres is not running: in this case it is
-            // fine that stop failed.  Otherwise it is an error that stop failed.
-            const PG_STATUS_NOT_RUNNING: i32 = 3;
-            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
-                println!("Attachment service data base is already stopped");
-                return Ok(());
-            } else {
-                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
-            }
        }

-        Ok(())
+        result
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())
    }

    /// Simple HTTP request wrapper for calling into attachment service
@@ -535,20 +249,17 @@ impl AttachmentService {
        if let Some(body) = body {
            builder = builder.json(&body)
        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }

        let response = builder.send().await?;
-        let response = response.error_from_body().await?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!(
+                "Unexpected status {} on {}",
+                response.status(),
+                path
+            ));
+        }

-        Ok(response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
+        Ok(response.json().await?)
    }

    /// Call into the attach_hook API, for use before handing out attachments to pageservers
@@ -558,19 +269,25 @@ impl AttachmentService {
        tenant_shard_id: TenantShardId,
        pageserver_id: NodeId,
    ) -> anyhow::Result<Option<u32>> {
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("attach-hook")
+            .unwrap();
+
        let request = AttachHookRequest {
            tenant_shard_id,
            node_id: Some(pageserver_id),
        };

-        let response = self
-            .dispatch::<_, AttachHookResponse>(
-                Method::POST,
-                "attach-hook".to_string(),
-                Some(request),
-            )
-            .await?;
+        let response = self.client.post(url).json(&request).send().await?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }

+        let response = response.json::<AttachHookResponse>().await?;
        Ok(response.gen)
    }

@@ -579,12 +296,22 @@ impl AttachmentService {
        &self,
        tenant_shard_id: TenantShardId,
    ) -> anyhow::Result<Option<(u32, NodeId)>> {
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
        let request = InspectRequest { tenant_shard_id };

-        let response = self
-            .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
-            .await?;
+        let response = self.client.post(url).json(&request).send().await?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }

+        let response = response.json::<InspectResponse>().await?;
        Ok(response.attachment)
    }

@@ -593,7 +320,7 @@ impl AttachmentService {
        &self,
        req: TenantCreateRequest,
    ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
+        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
            .await
    }

@@ -620,6 +347,20 @@ impl AttachmentService {
        .await
    }

+    #[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
+    pub async fn tenant_split(
+        &self,
+        tenant_id: TenantId,
+        new_shard_count: u8,
+    ) -> anyhow::Result<TenantShardSplitResponse> {
+        self.dispatch(
+            Method::PUT,
+            format!("tenant/{tenant_id}/shard_split"),
+            Some(TenantShardSplitRequest { new_shard_count }),
+        )
+        .await
+    }
+
    #[instrument(skip_all, fields(node_id=%req.node_id))]
    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
        self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
@@ -650,7 +391,7 @@ impl AttachmentService {
    ) -> anyhow::Result<TimelineInfo> {
        self.dispatch(
            Method::POST,
-            format!("v1/tenant/{tenant_id}/timeline"),
+            format!("tenant/{tenant_id}/timeline"),
            Some(req),
        )
        .await
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -17,7 +17,7 @@ use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
-use std::process::Command;
+use std::process::{Child, Command};
 use std::time::Duration;
 use std::{fs, io, thread};

@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    envs: EI,
    initial_pid_file: InitialPidFile,
    process_status_check: F,
-) -> anyhow::Result<()>
+) -> anyhow::Result<Child>
 where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
        InitialPidFile::Expect(path) => path,
    };

-    let spawned_process = filled_cmd.spawn().with_context(|| {
+    let mut spawned_process = filled_cmd.spawn().with_context(|| {
        format!("Could not spawn {process_name}, see console output and log files for details.")
    })?;
    let pid = spawned_process.id();
@@ -106,26 +106,12 @@ where
        i32::try_from(pid)
            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
    );
-    // set up a scopeguard to kill & wait for the child in case we panic or bail below
-    let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
-        println!("SIGKILL & wait the started process");
-        (|| {
-            // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
-            spawned_process.kill().context("SIGKILL child")?;
-            spawned_process.wait().context("wait() for child process")?;
-            anyhow::Ok(())
-        })()
-        .with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
-        .unwrap();
-    });

    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
-                println!("\n{process_name} started and passed status check, pid: {pid}");
-                // leak the child process, it'll outlive this neon_local invocation
-                drop(scopeguard::ScopeGuard::into_inner(spawned_process));
-                return Ok(());
+                println!("\n{process_name} started, pid: {pid}");
+                return Ok(spawned_process);
            }
            Ok(false) => {
                if retries == NOTICE_AFTER_RETRIES {
@@ -140,15 +126,16 @@ where
                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
-                println!("error starting process {process_name:?}: {e:#}");
+                println!("{process_name} failed to start: {e:#}");
+                if let Err(e) = spawned_process.kill() {
+                    println!("Could not stop {process_name} subprocess: {e:#}")
+                };
                return Err(e);
            }
        }
    }
    println!();
-    anyhow::bail!(
-        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
-    );
+    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -6,13 +6,13 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::{
    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
 };
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
+use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::{broker, local_env};
@@ -25,7 +25,6 @@ use pageserver_api::{
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -135,7 +134,7 @@ fn main() -> Result<()> {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
            "start" => rt.block_on(handle_start_all(sub_args, &env)),
-            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
+            "stop" => handle_stop_all(sub_args, &env),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -366,7 +365,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_one("force").expect("we set a default value");
+    let force = init_match.get_flag("force");
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

@@ -451,9 +450,7 @@ async fn handle_tenant(
                    generation: None,
                    shard_parameters: ShardParameters {
                        count: ShardCount(shard_count),
-                        stripe_size: shard_stripe_size
-                            .map(ShardStripeSize)
-                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
+                        stripe_size: shard_stripe_size.map(ShardStripeSize),
                    },
                    config: tenant_conf,
                })
@@ -575,6 +572,26 @@ async fn handle_tenant(
            println!("{tenant_table}");
            println!("{shard_table}");
        }
+        Some(("shard-split", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
+
+            let attachment_service = AttachmentService::from_env(env);
+            let result = attachment_service
+                .tenant_split(tenant_id, shard_count)
+                .await?;
+            println!(
+                "Split tenant {} into shards {}",
+                tenant_id,
+                result
+                    .new_shards
+                    .iter()
+                    .map(|s| format!("{:?}", s))
+                    .collect::<Vec<_>>()
+                    .join(",")
+            );
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -869,11 +886,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            let pageserver_id =
                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
                } else {
-                    None
+                    DEFAULT_PAGESERVER_ID
                };

            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
@@ -904,38 +919,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                endpoint.timeline_id,
            )?;

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
-                (
-                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
-                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by attachment service, therefore not sharded.
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
-                )
-            } else {
-                // Look up the currently attached location of the tenant, and its striping metadata,
-                // to pass these on to postgres.
-                let attachment_service = AttachmentService::from_env(env);
-                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = locate_result
-                    .shards
-                    .into_iter()
-                    .map(|shard| {
-                        (
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Attachment service reported bad hostname"),
-                            shard.listen_pg_port,
-                        )
-                    })
-                    .collect::<Vec<_>>();
-                let stripe_size = locate_result.shard_params.stripe_size;
-
-                (pageservers, stripe_size)
-            };
+            let attachment_service = AttachmentService::from_env(env);
+            let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+            let pageservers = locate_result
+                .shards
+                .into_iter()
+                .map(|shard| {
+                    (
+                        Host::parse(&shard.listen_pg_addr)
+                            .expect("Attachment service reported bad hostname"),
+                        shard.listen_pg_port,
+                    )
+                })
+                .collect::<Vec<_>>();
            assert!(!pageservers.is_empty());
+            let stripe_size = locate_result.shard_params.stripe_size.map(|s| s.0 as usize);

-            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

@@ -951,7 +951,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                    safekeepers,
                    pageservers,
                    remote_ext_config,
-                    stripe_size.0 as usize,
+                    stripe_size,
                )
                .await?;
        }
@@ -1056,9 +1056,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args), *register)
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1087,7 +1086,24 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args), false)
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1145,7 +1161,7 @@ async fn handle_attachment_service(
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = svc.stop(immediate).await {
+            if let Err(e) = svc.stop(immediate) {
                eprintln!("stop failed: {}", e);
                exit(1);
            }
@@ -1241,7 +1257,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        let attachment_service = AttachmentService::from_env(env);
        if let Err(e) = attachment_service.start().await {
            eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true).await;
+            try_stop_all(env, true);
            exit(1);
        }
    }
@@ -1249,11 +1265,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
        if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match), true)
+            .start(&pageserver_config_overrides(sub_match))
            .await
        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true).await;
+            try_stop_all(env, true);
            exit(1);
        }
    }
@@ -1262,23 +1278,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        let safekeeper = SafekeeperNode::from_env(env, node);
        if let Err(e) = safekeeper.start(vec![]).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false).await;
+            try_stop_all(env, false);
            exit(1);
        }
    }
    Ok(())
 }

-async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let immediate =
        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");

-    try_stop_all(env, immediate).await;
+    try_stop_all(env, immediate);

    Ok(())
 }

-async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -1313,7 +1329,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {

    if env.control_plane_api.is_some() {
        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate).await {
+        if let Err(e) = attachment_service.stop(immediate) {
            eprintln!("attachment service stop failed: {e:#}");
        }
    }
@@ -1417,15 +1433,9 @@ fn cli() -> Command {
        .required(false);

    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(InitForceMode))
+        .value_parser(value_parser!(bool))
        .long("force")
-        .default_value(
-            InitForceMode::MustNotExist
-                .to_possible_value()
-                .unwrap()
-                .get_name()
-                .to_owned(),
-        )
+        .action(ArgAction::SetTrue)
        .help("Force initialization even if the repository is not empty")
        .required(false);

@@ -1524,6 +1534,11 @@ fn cli() -> Command {
            .subcommand(Command::new("status")
                .about("Human readable summary of the tenant's shards and attachment locations")
                .arg(tenant_id_arg.clone()))
+            .subcommand(Command::new("shard-split")
+                .about("Increase the number of shards in the tenant")
+                .arg(tenant_id_arg.clone())
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1533,11 +1548,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
-                    .long("register")
-                    .default_value("true").required(false)
-                    .value_parser(value_parser!(bool))
-                    .value_name("register"))
+                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;

 use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
+use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,7 +70,6 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
-    features: Vec<ComputeFeature>,
 }

 //
@@ -141,7 +140,6 @@ impl ComputeControlPlane {
            // with this we basically test a case of waking up an idle compute, where
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates: true,
-            features: vec![],
        });

        ep.create_endpoint_dir()?;
@@ -156,7 +154,6 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
-                features: vec![],
            })?,
        )?;
        std::fs::write(
@@ -218,9 +215,6 @@ pub struct Endpoint {

    // Optimizations
    skip_pg_catalog_updates: bool,
-
-    // Feature flags
-    features: Vec<ComputeFeature>,
 }

 impl Endpoint {
@@ -250,7 +244,6 @@ impl Endpoint {
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
-            features: conf.features,
        })
    }

@@ -438,7 +431,7 @@ impl Endpoint {
    }

    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
-        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
+        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -479,7 +472,7 @@ impl Endpoint {
        safekeepers: Vec<NodeId>,
        pageservers: Vec<(Host, u16)>,
        remote_ext_config: Option<&String>,
-        shard_stripe_size: usize,
+        shard_stripe_size: Option<usize>,
    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
@@ -526,7 +519,7 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
-            features: self.features.clone(),
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -545,7 +538,7 @@ impl Endpoint {
            storage_auth_token: auth_token.clone(),
            remote_extensions,
            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
+            shard_stripe_size,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -558,8 +551,11 @@ impl Endpoint {

        // Launch compute_ctl
        println!("Starting postgres node at '{}'", self.connstr());
-        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-        cmd.args(["--http-port", &self.http_address.port().to_string()])
+        let mut cmd = Command::new("/usr/bin/taskset");
+
+        cmd.args(["-c".to_string(), "8-11".to_string()])
+            .args([self.env.neon_distrib_dir.join("compute_ctl")])
+            .args(["--http-port", &self.http_address.port().to_string()])
            .args(["--pgdata", self.pgdata().to_str().unwrap()])
            .args(["--connstr", &self.connstr()])
            .args([
@@ -583,21 +579,9 @@ impl Endpoint {
        }

        let child = cmd.spawn()?;
-        // set up a scopeguard to kill & wait for the child in case we panic or bail below
-        let child = scopeguard::guard(child, |mut child| {
-            println!("SIGKILL & wait the started process");
-            (|| {
-                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
-                child.kill().context("SIGKILL child")?;
-                child.wait().context("wait() for child process")?;
-                anyhow::Ok(())
-            })()
-            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
-            .unwrap();
-        });

        // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
+        // TODO use background_process::start_process instead
        let pid = child.id();
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        std::fs::write(pidfile_path, pid.to_string())?;
@@ -646,9 +630,6 @@ impl Endpoint {
            std::thread::sleep(ATTEMPT_INTERVAL);
        }

-        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
-        drop(scopeguard::ScopeGuard::into_inner(child));
-
        Ok(())
    }

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -5,7 +5,6 @@

 use anyhow::{bail, ensure, Context};

-use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
    }
 }

-#[derive(Clone, Copy)]
-pub enum InitForceMode {
-    MustNotExist,
-    EmptyDirOk,
-    RemoveAllContents,
-}
-
-impl ValueEnum for InitForceMode {
-    fn value_variants<'a>() -> &'a [Self] {
-        &[
-            Self::MustNotExist,
-            Self::EmptyDirOk,
-            Self::RemoveAllContents,
-        ]
-    }
-
-    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
-        Some(clap::builder::PossibleValue::new(match self {
-            InitForceMode::MustNotExist => "must-not-exist",
-            InitForceMode::EmptyDirOk => "empty-dir-ok",
-            InitForceMode::RemoveAllContents => "remove-all-contents",
-        }))
-    }
-}
-
 impl SafekeeperConf {
    /// Compute is served by port on which only tenant scoped tokens allowed, if
    /// it is configured.
@@ -223,11 +197,7 @@ impl LocalEnv {
    }

    pub fn attachment_service_bin(&self) -> PathBuf {
-        // Irrespective of configuration, attachment service binary is always
-        // run from the same location as neon_local.  This means that for compatibility
-        // tests that run old pageserver/safekeeper, they still run latest attachment service.
-        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
-        neon_local_bin_dir.join("attachment_service")
+        self.neon_distrib_dir.join("attachment_service")
    }

    pub fn safekeeper_bin(&self) -> PathBuf {
@@ -255,13 +225,7 @@ impl LocalEnv {
        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
            Ok(conf)
        } else {
-            let have_ids = self
-                .pageservers
-                .iter()
-                .map(|node| format!("{}:{}", node.id, node.listen_http_addr))
-                .collect::<Vec<_>>();
-            let joined = have_ids.join(",");
-            bail!("could not find pageserver {id}, have ids {joined}")
+            bail!("could not find pageserver {id}")
        }
    }

@@ -420,7 +384,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -429,34 +393,25 @@ impl LocalEnv {
        );

        if base_path.exists() {
-            match force {
-                InitForceMode::MustNotExist => {
-                    bail!(
-                        "directory '{}' already exists. Perhaps already initialized?",
-                        base_path.display()
-                    );
-                }
-                InitForceMode::EmptyDirOk => {
-                    if let Some(res) = std::fs::read_dir(base_path)?.next() {
-                        res.context("check if directory is empty")?;
-                        anyhow::bail!("directory not empty: {base_path:?}");
-                    }
-                }
-                InitForceMode::RemoveAllContents => {
-                    println!("removing all contents of '{}'", base_path.display());
-                    // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                    // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                    // S3 local SSD) to the `.neon` base directory.
-                    for entry in std::fs::read_dir(base_path)? {
-                        let entry = entry?;
-                        let path = entry.path();
-                        if path.is_dir() {
-                            fs::remove_dir_all(&path)?;
-                        } else {
-                            fs::remove_file(&path)?;
-                        }
+            if force {
+                println!("removing all contents of '{}'", base_path.display());
+                // Instead of directly calling `remove_dir_all`, we keep the original dir but remove
+                // all contents inside. This helps if the developer symlinks another directory (e.g.,
+                // S3 local SSD) to the `.neon` base directory.
+                for entry in std::fs::read_dir(base_path)? {
+                    let entry = entry?;
+                    let path = entry.path();
+                    if path.is_dir() {
+                        fs::remove_dir_all(&path)?;
+                    } else {
+                        fs::remove_file(&path)?;
                    }
                }
+            } else {
+                bail!(
+                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
+                    base_path.display()
+                );
            }
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,7 +11,8 @@ use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::Command;
+use std::process::{Child, Command};
+use std::str::FromStr;
 use std::time::Duration;

 use anyhow::{bail, Context};
@@ -30,7 +31,6 @@ use utils::{
    lsn::Lsn,
 };

-use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

@@ -109,16 +109,6 @@ impl PageServerNode {
                "control_plane_api='{}'",
                control_plane_api.as_str()
            ));
-
-            // Attachment service uses the same auth as pageserver: if JWT is enabled
-            // for us, we will also need it to talk to them.
-            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
-                let jwt_token = self
-                    .env
-                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
-                    .unwrap();
-                overrides.push(format!("control_plane_api_token='{}'", jwt_token));
-            }
        }

        if !cli_overrides
@@ -162,8 +152,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false, register).await
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+        self.start_node(config_overrides, false).await
    }

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -208,8 +198,7 @@ impl PageServerNode {
        &self,
        config_overrides: &[&str],
        update_config: bool,
-        register: bool,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<Child> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -230,11 +219,19 @@ impl PageServerNode {
        if update_config {
            args.push(Cow::Borrowed("--update-config"));
        }
+
+        let mut taskset_args = vec![
+            "-c".to_string(),
+            format!("{}", self.conf.id.0 - 1),
+            self.env.pageserver_bin().to_string_lossy().into(),
+        ];
+        taskset_args.extend(args.into_iter().map(|a| a.to_string()));
+
        background_process::start_process(
            "pageserver",
            &datadir,
-            &self.env.pageserver_bin(),
-            args.iter().map(Cow::as_ref),
+            &PathBuf::from_str("/usr/bin/taskset").unwrap(),
+            taskset_args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
@@ -246,26 +243,7 @@ impl PageServerNode {
                }
            },
        )
-        .await?;
-
-        if register {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let (pg_host, pg_port) =
-                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            attachment_service
-                .node_register(NodeRegisterRequest {
-                    node_id: self.conf.id,
-                    listen_pg_addr: pg_host.to_string(),
-                    listen_pg_port: pg_port.unwrap_or(5432),
-                    listen_http_addr: http_host.to_string(),
-                    listen_http_port: http_port.unwrap_or(80),
-                })
-                .await?;
-        }
-
-        Ok(())
+        .await
    }

    fn pageserver_basic_args<'a>(
@@ -528,16 +506,9 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .tenant_secondary_download(*tenant_id)
-            .await?)
-    }
-
    pub async fn timeline_create(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        new_timeline_id: TimelineId,
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
@@ -551,10 +522,7 @@ impl PageServerNode {
            pg_version,
            existing_initdb_timeline_id,
        };
-        Ok(self
-            .http_client
-            .timeline_create(tenant_shard_id, &req)
-            .await?)
+        Ok(self.http_client.timeline_create(tenant_id, &req).await?)
    }

    /// Import a basebackup prepared using either:
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,6 +7,7 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
+use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
@@ -103,7 +104,7 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
--- a/diesel.toml
+++ b/diesel.toml
@@ -1,9 +0,0 @@
-# For documentation on how to configure this file,
-# see https://diesel.rs/guides/configuring-diesel-cli
-
-[print_schema]
-file = "control_plane/attachment_service/src/schema.rs"
-custom_type_derives = ["diesel::query_builder::QueryId"]
-
-[migrations_directory]
-dir = "control_plane/attachment_service/migrations"
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -1,142 +0,0 @@
-# Vectored Timeline Get
-
-Created on: 2024-01-02
-Author: Christian Schwarz
-
-# Summary
-
-A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
-
-# Motivation
-
-During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
-For an example, see
-https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.
-
-Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
-For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
-If it's negative (no entry), we resume layer map traversal.
-If it's positive, we collect the result in our reconstruct data bag.
-If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
-Otherwise, we resume layer map traversal.
-
-Doing this many `Timeline::get` calls is quite inefficient because:
-
-1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
-2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
-   This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but
-   may not contain the data we're looking for.
-3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
-   So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
-   The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.
-
-[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9
-
-# Solution
-
-We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API  unlocks:
-
-* more efficient basebackup
-* batched IO during compaction (useful for strides of unchanged pages)
-* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
-  * if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API
-
-# DoD
-
-There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
-It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.
-
-It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
-It is sufficient to simply return a `Vec<Bytes>`, but, likely more efficient solutions can be found after studying all the callers of `Timeline::get`.
-
-Functionally, the behavior of `Timeline::get_vectored` is equivalent to
-
-```rust
-let mut keys_iter: impl Iterator<Item=Key>
-  = src.map(|KeyVec{ base, count }| (base..base+count)).flatten();
-let mut out = Vec::new();
-for key in keys_iter {
-    let data = Timeline::get(key, lsn)?;
-    out.push(data);
-}
-return out;
-```
-
-However, unlike above, an ideal solution will
-
-* Visit each `struct Layer` at most once.
-* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
-  * This means, read each `DiskBtree` page at most once.
-* Facilitate merging of the reads we issue to the OS and eventually NVMe.
-
-Each of these items above represents a signficant amount of work.
-
-## Performance
-
-Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
-A reasonable constant overhead over current `Timeline::get` is acceptable.
-
-The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.
-
-# Implementation
-
-High-level set of tasks / changes to be made:
-
- **Get clarity on API**:
-  - Define naive `Timeline::get_vectored` implementation & adopt it across pageserver.
-  - The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
-  - Start with something simple to explore the different usages of the API.
-    Then iterate with peers until we have something that is good enough.
- **Vectored Layer Map traversal**
-  - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
-  - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
-    - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
-      but need more.
-      Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data.
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
-  - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
-  - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
-  - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
-  - What needs to happen to `DiskBtree::visit()`?
-    * Minimally
-      * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
-      * Change the visit code to to invoke the callback for all values in the `KeyVec`'s key range
-      * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
-    * Ideally:
-      * Take a `&[KeyVec]`, sort it;
-      * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
-      * NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
-  - The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
-    - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
-      - We hit (and rely) on `PageCache` and `VirtualFile here (not great under pressure)
-    - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
-  - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
-  - That is tricky because
-    - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
-    - there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
-      - The guiding principle here should be to avoid coupling this work to the `PageCache`.
-      - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
-
-
-Let's see how we can improve by doing the first three items in above list first, then revisit.
-
-## Rollout / Feature Flags
-
-No feature flags are required for this epic.
-
-At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.
-
-It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
-That will help isolate performance regressions across weekly releases.
-
-# Interaction With Sharding
-
-[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`.
-
-Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.
-
-Given that this is already the case, there shouldn't be significant interaction/interference with sharding.
-
-However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
-For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `ruff`, and type hints via `mypy`.
+We enforce code formatting via `black`, linting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run ruff format . # All code is reformatted
-poetry run ruff check .  # Python linter
-poetry run mypy .        # Ensure there are no typing errors
+poetry run black .  # All code is reformatted
+poetry run ruff .  # Python linter
+poetry run mypy .  # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -77,7 +77,6 @@ pub struct ComputeSpec {
    pub pgbouncer_settings: Option<HashMap<String, String>>,

    // Stripe size for pageserver sharding, in pages
-    #[serde(default)]
    pub shard_stripe_size: Option<usize>,
 }

@@ -86,16 +85,10 @@ pub struct ComputeSpec {
 #[serde(rename_all = "snake_case")]
 pub enum ComputeFeature {
    // XXX: Add more feature flags here.
-    /// Enable the experimental activity monitor logic, which uses `pg_stat_database` to
-    /// track short-lived connections as user activity.
-    ActivityMonitorExperimental,

-    /// Enable running migrations
-    Migrations,
-
-    /// This is a special feature flag that is used to represent unknown feature flags.
-    /// Basically all unknown to enum flags are represented as this one. See unit test
-    /// `parse_unknown_features()` for more details.
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically, all flags unknown to this enum are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
    #[serde(other)]
    UnknownFeature,
 }
@@ -292,23 +285,4 @@ mod tests {
        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
    }
-
-    #[test]
-    fn parse_known_features() {
-        // Test that we can properly parse known feature flags.
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-
-        // Add known feature flags.
-        let features = vec!["activity_monitor_experimental"];
-        ob.insert("features".into(), features.into());
-
-        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
-
-        assert_eq!(
-            spec.features,
-            vec![ComputeFeature::ActivityMonitorExperimental]
-        );
-    }
 }
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -19,7 +19,6 @@ strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
 thiserror.workspace = true
-humantime-serde.workspace = true

 workspace_hack.workspace = true

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,11 +1,7 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
-use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::{fmt, ops::Range};
-
-use crate::reltag::{BlockNumber, RelTag, SlruKind};
+use std::fmt;

 /// Key used in the Repository kv-store.
 ///
@@ -145,403 +141,8 @@ impl Key {
    }
 }

-// Layout of the Key address space
-//
-// The Key struct, used to address the underlying key-value store, consists of
-// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
-// all the data and metadata keys into those 18 bytes.
-//
-// Principles for the mapping:
-//
-// - Things that are often accessed or modified together, should be close to
-//   each other in the key space. For example, if a relation is extended by one
-//   block, we create a new key-value pair for the block data, and update the
-//   relation size entry. Because of that, the RelSize key comes after all the
-//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
-//   to each other.
-//
-// The key space is divided into four major sections, identified by the first
-// byte, and the form a hierarchy:
-//
-// 00 Relation data and metadata
-//
-//   DbDir    () -> (dbnode, spcnode)
-//   Filenodemap
-//   RelDir   -> relnode forknum
-//       RelBlocks
-//       RelSize
-//
-// 01 SLRUs
-//
-//   SlruDir  kind
-//   SlruSegBlocks segno
-//   SlruSegSize
-//
-// 02 pg_twophase
-//
-// 03 misc
-//    Controlfile
-//    checkpoint
-//    pg_version
-//
-// 04 aux files
-//
-// Below is a full list of the keyspace allocation:
-//
-// DbDir:
-// 00 00000000 00000000 00000000 00   00000000
-//
-// Filenodemap:
-// 00 SPCNODE  DBNODE   00000000 00   00000000
-//
-// RelDir:
-// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
-//
-// RelBlock:
-// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
-//
-// RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
-//
-// SlruDir:
-// 01 kind     00000000 00000000 00   00000000
-//
-// SlruSegBlock:
-// 01 kind     00000001 SEGNO    00   BLKNUM
-//
-// SlruSegSize:
-// 01 kind     00000001 SEGNO    00   FFFFFFFF
-//
-// TwoPhaseDir:
-// 02 00000000 00000000 00000000 00   00000000
-//
-// TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
-//
-// ControlFile:
-// 03 00000000 00000000 00000000 00   00000000
-//
-// Checkpoint:
-// 03 00000000 00000000 00000000 00   00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00   00000002
-//
-
-//-- Section 01: relation data and metadata
-
-pub const DBDIR_KEY: Key = Key {
-    field1: 0x00,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-#[inline(always)]
-pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-#[inline(always)]
-pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-#[inline(always)]
-pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 1,
-    }
-}
-
-#[inline(always)]
-pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-
-#[inline(always)]
-pub fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
-#[inline(always)]
-pub fn rel_key_range(rel: RelTag) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum + 1,
-        field6: 0,
-    }
-}
-
-//-- Section 02: SLRUs
-
-#[inline(always)]
-pub fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-#[inline(always)]
-pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-#[inline(always)]
-pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0xffffffff,
-    }
-}
-
-#[inline(always)]
-pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
-    let field2 = match kind {
-        SlruKind::Clog => 0x00,
-        SlruKind::MultiXactMembers => 0x01,
-        SlruKind::MultiXactOffsets => 0x02,
-    };
-
-    Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 1,
-        field6: 0,
-    }
-}
-
-//-- Section 03: pg_twophase
-
-pub const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-#[inline(always)]
-pub fn twophase_file_key(xid: TransactionId) -> Key {
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }
-}
-
-#[inline(always)]
-pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
-    let (next_xid, overflowed) = xid.overflowing_add(1);
-
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }..Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
-    }
-}
-
-//-- Section 03: Control file
-pub const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-pub const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-pub const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
-// Reverse mappings for a few Keys.
-// These are needed by WAL redo manager.
-
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-#[inline(always)]
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
-#[inline(always)]
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
-
-#[inline(always)]
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
-
-#[inline(always)]
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
-
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
-
-#[inline(always)]
-pub fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff // and not SlruSegSize
-}
-
-#[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
-}
-
-/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
-#[inline(always)]
-pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
-    Ok(match key.field1 {
-        0x00 => (
-            RelTag {
-                spcnode: key.field2,
-                dbnode: key.field3,
-                relnode: key.field4,
-                forknum: key.field5,
-            },
-            key.field6,
-        ),
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
+    key.field1 == 0x00 && key.field4 != 0
 }

 impl std::str::FromStr for Key {
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -104,7 +104,6 @@ pub struct KeySpaceAccum {
    accum: Option<Range<Key>>,

    ranges: Vec<Range<Key>>,
-    size: u64,
 }

 impl KeySpaceAccum {
@@ -112,19 +111,14 @@ impl KeySpaceAccum {
        Self {
            accum: None,
            ranges: Vec::new(),
-            size: 0,
        }
    }

-    #[inline(always)]
    pub fn add_key(&mut self, key: Key) {
        self.add_range(singleton_range(key))
    }

-    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
-        self.size += key_range_size(&range) as u64;
-
        match self.accum.as_mut() {
            Some(accum) => {
                if range.start == accum.end {
@@ -150,23 +144,6 @@ impl KeySpaceAccum {
            ranges: self.ranges,
        }
    }
-
-    pub fn consume_keyspace(&mut self) -> KeySpace {
-        if let Some(accum) = self.accum.take() {
-            self.ranges.push(accum);
-        }
-
-        let mut prev_accum = KeySpaceAccum::new();
-        std::mem::swap(self, &mut prev_accum);
-
-        KeySpace {
-            ranges: prev_accum.ranges,
-        }
-    }
-
-    pub fn size(&self) -> u64 {
-        self.size
-    }
 }

 ///
@@ -275,30 +252,6 @@ mod tests {
        }
    }

-    #[test]
-    fn keyspace_consume() {
-        let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
-
-        let mut accum = KeySpaceAccum::new();
-        for range in &ranges {
-            accum.add_range(range.clone());
-        }
-
-        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
-        assert_eq!(accum.size(), expected_size);
-
-        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
-        assert_eq!(accum.size(), 0);
-
-        assert_ks_eq(&accum.consume_keyspace(), vec![]);
-        assert_eq!(accum.size(), 0);
-
-        for range in &ranges {
-            accum.add_range(range.clone());
-        }
-        assert_ks_eq(&accum.to_keyspace(), ranges);
-    }
-
    #[test]
    fn keyspace_add_range() {
        // two separate ranges
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -2,9 +2,9 @@ pub mod partitioning;

 use std::{
    collections::HashMap,
-    io::{BufRead, Read},
+    io::Read,
    num::{NonZeroU64, NonZeroUsize},
-    time::{Duration, SystemTime},
+    time::SystemTime,
 };

 use byteorder::{BigEndian, ReadBytesExt};
@@ -191,17 +191,27 @@ pub struct TimelineCreateRequest {
    pub pg_version: Option<u32>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TenantShardSplitRequest {
+    pub new_shard_count: u8,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TenantShardSplitResponse {
+    pub new_shards: Vec<TenantShardId>,
+}
+
 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct ShardParameters {
    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stripe_size: Option<ShardStripeSize>,
 }

 impl ShardParameters {
-    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
-
    pub fn is_unsharded(&self) -> bool {
        self.count == ShardCount(0)
    }
@@ -211,7 +221,7 @@ impl Default for ShardParameters {
    fn default() -> Self {
        Self {
            count: ShardCount(0),
-            stripe_size: Self::DEFAULT_STRIPE_SIZE,
+            stripe_size: None,
        }
    }
 }
@@ -266,37 +276,17 @@ pub struct TenantConfig {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
-    pub eviction_policy: Option<EvictionPolicy>,
+    // We defer the parsing of the eviction_policy field to the request handler.
+    // Otherwise we'd have to move the types for eviction policy into this package.
+    // We might do that once the eviction feature has stabilized.
+    // For now, this field is not even documented in the openapi_spec.yml.
+    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub gc_feedback: Option<bool>,
    pub heatmap_period: Option<String>,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
-pub enum EvictionPolicy {
-    NoEviction,
-    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
-}
-
-impl EvictionPolicy {
-    pub fn discriminant_str(&self) -> &'static str {
-        match self {
-            EvictionPolicy::NoEviction => "NoEviction",
-            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct EvictionPolicyLayerAccessThreshold {
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[serde(with = "humantime_serde")]
-    pub threshold: Duration,
-}
-
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
@@ -322,8 +312,6 @@ pub struct LocationConfig {
    /// If attaching, in what generation?
    #[serde(default)]
    pub generation: Option<u32>,
-
-    // If requesting mode `Secondary`, configuration for that.
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

@@ -336,17 +324,11 @@ pub struct LocationConfig {
    #[serde(default)]
    pub shard_stripe_size: u32,

-    // This configuration only affects attached mode, but should be provided irrespective
-    // of the mode, as a secondary location might transition on startup if the response
-    // to the `/re-attach` control plane API requests it.
+    // Custom storage configuration for the tenant, if any.  Note: this is not the
+    // `Secondary`-mode configuration; that is carried by `secondary_conf`.
    pub tenant_conf: TenantConfig,
 }

-#[derive(Serialize, Deserialize)]
-pub struct LocationConfigListResponse {
-    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
-}
-
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct TenantCreateResponse(pub TenantId);
@@ -430,8 +412,6 @@ pub struct TenantInfo {
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub generation: Option<u32>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -888,10 +868,9 @@ impl PagestreamBeMessage {
                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
                }
                Tag::Error => {
-                    let mut msg = Vec::new();
-                    buf.read_until(0, &mut msg)?;
-                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
-                    let rust_str = cstring.to_str()?;
+                    let buf = buf.get_ref();
+                    let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                    let rust_str = cstr.to_str()?;
                    PagestreamBeMessage::Error(PagestreamErrorResponse {
                        message: rust_str.to_owned(),
                    })
@@ -985,7 +964,6 @@ mod tests {
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
-            generation: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -1006,7 +984,6 @@ mod tests {
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
-            generation: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -32,9 +32,6 @@ pub struct RelTag {
    pub relnode: Oid,
 }

-/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
-pub type BlockNumber = u32;
-
 impl PartialOrd for RelTag {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
@@ -111,19 +108,7 @@ impl RelTag {
 /// These files are divided into segments, which are divided into
 /// pages of the same BLCKSZ as used for relation files.
 ///
-#[derive(
-    Debug,
-    Clone,
-    Copy,
-    Hash,
-    Serialize,
-    Deserialize,
-    PartialEq,
-    Eq,
-    PartialOrd,
-    Ord,
-    strum_macros::EnumIter,
-)]
+#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
 pub enum SlruKind {
    Clog,
    MultiXactMembers,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -88,6 +88,10 @@ impl TenantShardId {
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
    pub fn to_index(&self) -> ShardIndex {
        ShardIndex {
            shard_number: self.shard_number,
@@ -419,7 +423,7 @@ impl ShardIdentity {
            number,
            count: params.count,
            layout: LAYOUT_V1,
-            stripe_size: params.stripe_size,
+            stripe_size: params.stripe_size.unwrap_or(DEFAULT_STRIPE_SIZE),
        }
    }

@@ -550,7 +554,12 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    // all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,12 +35,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Query handler indicated that client should reconnect
-    #[error("Server requested reconnect")]
-    Reconnect,
-    /// Query named an entity that was not found
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                info!("Stopped due to shutdown");
                Ok(())
            }
-            Err(QueryError::Reconnect) => {
-                // Dropping out of this loop implicitly disconnects
-                info!("Stopped due to handler reconnect request");
-                Ok(())
-            }
            Err(QueryError::Disconnected(e)) => {
                info!("Disconnected ({e:#})");
                // Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Reconnect => "reconnect".to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::NotFound(_) => "not found".to_string(),
        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::SimulatedConnectionError => {
            error!("query handler for query '{query}' failed due to a simulated connection error")
        }
-        QueryError::Reconnect => {
-            info!("query handler for '{query}' requested client to reconnect")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::NotFound(reason) => {
-            info!("query handler for '{query}' entity not found: {reason}")
-        }
        QueryError::Unauthorized(e) => {
            warn!("query handler for '{query}' failed with authentication error: {e}");
        }
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -329,8 +329,8 @@ impl CheckPoint {
    ///
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
-        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
-        let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
+        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound.
+        let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
        new_xid =
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -5,10 +5,7 @@ use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::pin::Pin;
-use std::str::FromStr;
 use std::sync::Arc;
-use std::time::Duration;
-use std::time::SystemTime;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -16,15 +13,12 @@ use azure_core::request_options::{MaxResults, Metadata, Range};
 use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
 use futures::stream::Stream;
 use futures_util::StreamExt;
-use http_types::{StatusCode, Url};
-use tokio::time::Instant;
-use tokio_util::sync::CancellationToken;
+use http_types::StatusCode;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -185,6 +179,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
    }
 }

+#[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
    async fn list(
        &self,
@@ -327,65 +322,6 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Copy).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
-
-        let source_url = format!(
-            "{}/{}",
-            self.client.url()?,
-            self.relative_path_to_name(from)
-        );
-        let builder = blob_client.copy(Url::from_str(&source_url)?);
-
-        let result = builder.into_future().await?;
-
-        let mut copy_status = result.copy_status;
-        let start_time = Instant::now();
-        const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
-        loop {
-            match copy_status {
-                CopyStatus::Aborted => {
-                    anyhow::bail!("Received abort for copy from {from} to {to}.");
-                }
-                CopyStatus::Failed => {
-                    anyhow::bail!("Received failure response for copy from {from} to {to}.");
-                }
-                CopyStatus::Success => return Ok(()),
-                CopyStatus::Pending => (),
-            }
-            // The copy is taking longer. Waiting a second and then re-trying.
-            // TODO estimate time based on copy_progress and adjust time based on that
-            tokio::time::sleep(Duration::from_millis(1000)).await;
-            let properties = blob_client.get_properties().into_future().await?;
-            let Some(status) = properties.blob.properties.copy_status else {
-                tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
-                return Ok(());
-            };
-            if start_time.elapsed() > MAX_WAIT_TIME {
-                anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
-                    MAX_WAIT_TIME.as_secs_f32(),
-                    properties.blob.properties.copy_progress,
-                );
-            }
-            copy_status = status;
-        }
-    }
-
-    async fn time_travel_recover(
-        &self,
-        _prefix: Option<&RemotePath>,
-        _timestamp: SystemTime,
-        _done_if_after: SystemTime,
-        _cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        // TODO use Azure point in time recovery feature for this
-        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
-        Err(anyhow::anyhow!(
-            "time travel recovery for azure blob storage is not implemented"
-        ))
-    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -25,7 +25,6 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
-use tokio_util::sync::CancellationToken;
 use toml_edit::Item;
 use tracing::info;

@@ -143,7 +142,7 @@ pub struct Listing {
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
-#[allow(async_fn_in_trait)]
+#[async_trait::async_trait]
 pub trait RemoteStorage: Send + Sync + 'static {
    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -208,18 +207,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
-
-    /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
-
-    /// Resets the content of everything with the given prefix to the given state
-    async fn time_travel_recover(
-        &self,
-        prefix: Option<&RemotePath>,
-        timestamp: SystemTime,
-        done_if_after: SystemTime,
-        cancel: CancellationToken,
-    ) -> anyhow::Result<()>;
 }

 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -272,15 +259,14 @@ impl std::error::Error for DownloadError {}
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
-// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
-pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
+pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
    AzureBlob(Arc<AzureBlobStorage>),
-    Unreliable(Other),
+    Unreliable(Arc<UnreliableWrapper>),
 }

-impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
+impl GenericRemoteStorage {
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -388,42 +374,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
-
-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
-        }
-    }
-
-    pub async fn time_travel_recover(
-        &self,
-        prefix: Option<&RemotePath>,
-        timestamp: SystemTime,
-        done_if_after: SystemTime,
-        cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
-                    .await
-            }
-            Self::AwsS3(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
-                    .await
-            }
-            Self::AzureBlob(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
-                    .await
-            }
-            Self::Unreliable(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
-                    .await
-            }
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -710,8 +660,6 @@ impl ConcurrencyLimiter {
            RequestKind::Put => &self.write,
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
-            RequestKind::Copy => &self.write,
-            RequestKind::TimeTravel => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::{io::ReaderStream, sync::CancellationToken};
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -157,6 +157,7 @@ impl LocalFs {
    }
 }

+#[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
    async fn list(
        &self,
@@ -408,31 +409,6 @@ impl RemoteStorage for LocalFs {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let from_path = from.with_base(&self.storage_root);
-        let to_path = to.with_base(&self.storage_root);
-        create_target_directory(&to_path).await?;
-        fs::copy(&from_path, &to_path).await.with_context(|| {
-            format!(
-                "Failed to copy file from '{from_path}' to '{to_path}'",
-                from_path = from_path,
-                to_path = to_path
-            )
-        })?;
-        Ok(())
-    }
-
-    #[allow(clippy::diverging_sub_expression)]
-    async fn time_travel_recover(
-        &self,
-        _prefix: Option<&RemotePath>,
-        _timestamp: SystemTime,
-        _done_if_after: SystemTime,
-        _cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        unimplemented!()
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -6,14 +6,12 @@

 use std::{
    borrow::Cow,
-    collections::HashMap,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
-    time::SystemTime,
 };

-use anyhow::{anyhow, Context as _};
+use anyhow::Context as _;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
@@ -29,19 +27,17 @@ use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
+    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;

+use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
-use aws_smithy_types::{body::SdkBody, DateTime};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio_util::sync::CancellationToken;
-use utils::backoff;

 use super::StorageMetadata;
 use crate::{
@@ -274,59 +270,6 @@ impl S3Bucket {
            }
        }
    }
-
-    async fn delete_oids(
-        &self,
-        kind: RequestKind,
-        delete_objects: &[ObjectIdentifier],
-    ) -> anyhow::Result<()> {
-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
-            let started_at = start_measuring_requests(kind);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(
-                    Delete::builder()
-                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
-                )
-                .send()
-                .await;
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
-            let resp = resp?;
-            metrics::BUCKET_METRICS
-                .deleted_objects_total
-                .inc_by(chunk.len() as u64);
-            if let Some(errors) = resp.errors {
-                // Log a bounded number of the errors within the response:
-                // these requests can carry 1000 keys so logging each one
-                // would be too verbose, especially as errors may lead us
-                // to retry repeatedly.
-                const LOG_UP_TO_N_ERRORS: usize = 10;
-                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                    tracing::warn!(
-                        "DeleteObjects key {} failed: {}: {}",
-                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                    );
-                }
-
-                return Err(anyhow::format_err!(
-                    "Failed to delete {} objects",
-                    errors.len()
-                ));
-            }
-        }
-        Ok(())
-    }
 }

 pin_project_lite::pin_project! {
@@ -430,6 +373,7 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
    }
 }

+#[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    async fn list(
        &self,
@@ -549,38 +493,6 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
-
-        let started_at = start_measuring_requests(kind);
-
-        // we need to specify bucket_name as a prefix
-        let copy_source = format!(
-            "{}/{}",
-            self.bucket_name,
-            self.relative_path_to_s3_object(from)
-        );
-
-        let res = self
-            .client
-            .copy_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(to))
-            .copy_source(copy_source)
-            .send()
-            .await;
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
-    }
-
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
@@ -625,168 +537,64 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        self.delete_oids(kind, &delete_objects).await
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+            let started_at = start_measuring_requests(kind);
+
+            let resp = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
+                .send()
+                .await;
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
+            match resp {
+                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
+                    if let Some(errors) = resp.errors {
+                        // Log a bounded number of the errors within the response:
+                        // these requests can carry 1000 keys so logging each one
+                        // would be too verbose, especially as errors may lead us
+                        // to retry repeatedly.
+                        const LOG_UP_TO_N_ERRORS: usize = 10;
+                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                            tracing::warn!(
+                                "DeleteObjects key {} failed: {}: {}",
+                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                            );
+                        }
+
+                        return Err(anyhow::format_err!(
+                            "Failed to delete {} objects",
+                            errors.len()
+                        ));
+                    }
+                }
+                Err(e) => {
+                    return Err(e.into());
+                }
+            }
+        }
+        Ok(())
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths).await
    }
-
-    async fn time_travel_recover(
-        &self,
-        prefix: Option<&RemotePath>,
-        timestamp: SystemTime,
-        done_if_after: SystemTime,
-        cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        let kind = RequestKind::TimeTravel;
-        let _guard = self.permit(kind).await;
-
-        let timestamp = DateTime::from(timestamp);
-        let done_if_after = DateTime::from(done_if_after);
-
-        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
-
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let prefix = prefix
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
-
-        let warn_threshold = 3;
-        let max_retries = 10;
-        let is_permanent = |_e: &_| false;
-
-        let list = backoff::retry(
-            || async {
-                Ok(self
-                    .client
-                    .list_object_versions()
-                    .bucket(self.bucket_name.clone())
-                    .set_prefix(prefix.clone())
-                    .send()
-                    .await?)
-            },
-            is_permanent,
-            warn_threshold,
-            max_retries,
-            "listing object versions for time_travel_recover",
-            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
-        )
-        .await?;
-
-        if list.is_truncated().unwrap_or_default() {
-            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
-        }
-
-        let mut versions_deletes = list
-            .versions()
-            .iter()
-            .map(VerOrDelete::Version)
-            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
-            .collect::<Vec<_>>();
-
-        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
-
-        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
-
-        for vd in versions_deletes {
-            let last_modified = vd.last_modified();
-            let version_id = vd.version_id();
-            let key = vd.key();
-            let (Some(last_modified), Some(version_id), Some(key)) =
-                (last_modified, version_id, key)
-            else {
-                anyhow::bail!(
-                    "One (or more) of last_modified, key, and id is None. \
-                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
-                    last_modified, key, version_id,
-                );
-            };
-            if version_id == "null" {
-                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
-                    indicating either disabled versioning, or legacy objects with null version id values");
-            }
-            tracing::trace!(
-                "Parsing version key={key} version_id={version_id} is_delete={}",
-                matches!(vd, VerOrDelete::DeleteMarker(_))
-            );
-
-            vds_for_key
-                .entry(key)
-                .or_default()
-                .push((vd, last_modified, version_id));
-        }
-        for (key, versions) in vds_for_key {
-            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
-            if last_last_modified > &&done_if_after {
-                tracing::trace!("Key {key} has version later than done_if_after, skipping");
-                continue;
-            }
-            // the version we want to restore to.
-            let version_to_restore_to =
-                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
-                    Ok(v) => v,
-                    Err(e) => e,
-                };
-            if version_to_restore_to == versions.len() {
-                tracing::trace!("Key {key} has no changes since timestamp, skipping");
-                continue;
-            }
-            let mut do_delete = false;
-            if version_to_restore_to == 0 {
-                // All versions more recent, so the key didn't exist at the specified time point.
-                tracing::trace!(
-                    "All {} versions more recent for {key}, deleting",
-                    versions.len()
-                );
-                do_delete = true;
-            } else {
-                match &versions[version_to_restore_to - 1] {
-                    (VerOrDelete::Version(_), _last_modified, version_id) => {
-                        tracing::trace!("Copying old version {version_id} for {key}...");
-                        // Restore the state to the last version by copying
-                        let source_id =
-                            format!("{}/{key}?versionId={version_id}", self.bucket_name);
-
-                        backoff::retry(
-                            || async {
-                                Ok(self
-                                    .client
-                                    .copy_object()
-                                    .bucket(self.bucket_name.clone())
-                                    .key(key)
-                                    .copy_source(&source_id)
-                                    .send()
-                                    .await?)
-                            },
-                            is_permanent,
-                            warn_threshold,
-                            max_retries,
-                            "listing object versions for time_travel_recover",
-                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
-                        )
-                        .await?;
-                    }
-                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
-                        do_delete = true;
-                    }
-                }
-            };
-            if do_delete {
-                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
-                    // Key has since been deleted (but there was some history), no need to do anything
-                    tracing::trace!("Key {key} already deleted, skipping.");
-                } else {
-                    tracing::trace!("Deleting {key}...");
-
-                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
-                    self.delete_oids(kind, &[oid]).await?;
-                }
-            }
-        }
-        Ok(())
-    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -811,32 +619,6 @@ fn start_measuring_requests(
    })
 }

-enum VerOrDelete<'a> {
-    Version(&'a ObjectVersion),
-    DeleteMarker(&'a DeleteMarkerEntry),
-}
-
-impl<'a> VerOrDelete<'a> {
-    fn last_modified(&self) -> Option<&'a DateTime> {
-        match self {
-            VerOrDelete::Version(v) => v.last_modified(),
-            VerOrDelete::DeleteMarker(v) => v.last_modified(),
-        }
-    }
-    fn version_id(&self) -> Option<&'a str> {
-        match self {
-            VerOrDelete::Version(v) => v.version_id(),
-            VerOrDelete::DeleteMarker(v) => v.version_id(),
-        }
-    }
-    fn key(&self) -> Option<&'a str> {
-        match self {
-            VerOrDelete::Version(v) => v.key(),
-            VerOrDelete::DeleteMarker(v) => v.key(),
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use camino::Utf8Path;
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -11,8 +11,6 @@ pub(crate) enum RequestKind {
    Put = 1,
    Delete = 2,
    List = 3,
-    Copy = 4,
-    TimeTravel = 5,
 }

 use RequestKind::*;
@@ -24,8 +22,6 @@ impl RequestKind {
            Put => "put_object",
            Delete => "delete_object",
            List => "list_objects",
-            Copy => "copy_object",
-            TimeTravel => "time_travel_recover",
        }
    }
    const fn as_index(&self) -> usize {
@@ -33,7 +29,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 6]);
+pub(super) struct RequestTyped<C>([C; 4]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -42,8 +38,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
-        let arr = std::array::from_fn::<C, 6, _>(|index| {
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,19 +3,16 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
+use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
-use std::time::SystemTime;
-use std::{collections::hash_map::Entry, sync::Arc};
-use tokio_util::sync::CancellationToken;

 use crate::{
-    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata,
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
 };

 pub struct UnreliableWrapper {
-    inner: GenericRemoteStorage<Arc<VoidStorage>>,
+    inner: crate::GenericRemoteStorage,

    // This many attempts of each operation will fail, then we let it succeed.
    attempts_to_fail: u64,
@@ -32,21 +29,11 @@ enum RemoteOp {
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
-    TimeTravelRecover(Option<RemotePath>),
 }

 impl UnreliableWrapper {
    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
-        let inner = match inner {
-            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
-            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
-            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
-            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
-            GenericRemoteStorage::Unreliable(_s) => {
-                panic!("Can't wrap unreliable wrapper unreliably")
-            }
-        };
        UnreliableWrapper {
            inner,
            attempts_to_fail,
@@ -97,9 +84,7 @@ impl UnreliableWrapper {
    }
 }

-// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
-type VoidStorage = crate::LocalFs;
-
+#[async_trait::async_trait]
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
@@ -177,24 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
        }
        Ok(())
    }
-
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        // copy is equivalent to download + upload
-        self.attempt(RemoteOp::Download(from.clone()))?;
-        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
-    }
-
-    async fn time_travel_recover(
-        &self,
-        prefix: Option<&RemotePath>,
-        timestamp: SystemTime,
-        done_if_after: SystemTime,
-        cancel: CancellationToken,
-    ) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
-        self.inner
-            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
-            .await
-    }
 }
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,288 +0,0 @@
-use anyhow::Context;
-use camino::Utf8Path;
-use remote_storage::RemotePath;
-use std::collections::HashSet;
-use std::sync::Arc;
-use test_context::test_context;
-use tracing::debug;
-
-use crate::common::{download_to_vec, upload_stream, wrap_stream};
-
-use super::{
-    MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
-};
-
-/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
-/// See the client creation in [`create_s3_client`] for details on the required env vars.
-/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
-/// where
-/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// since current default AWS S3 pagination limit is 1000.
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
-///
-/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledStorageWithTestBlobs)]
-#[tokio::test]
-async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `s3_pagination_should_work` for more information.
-///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
-#[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorage::Enabled(ctx) => ctx,
-        MaybeEnabledStorage::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledStorage::Enabled(ctx) => ctx,
-        MaybeEnabledStorage::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
-
-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
-
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(len as u64))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledStorage)]
-#[tokio::test]
-async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
-    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-    let path_dest = RemotePath::new(Utf8Path::new(
-        format!("{}/file_dest", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    ctx.client.copy_object(&path, &path_dest).await?;
-
-    let dl = ctx.client.download(&path_dest).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete_objects(&[path.clone(), path_dest.clone()])
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -6,23 +6,263 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::{
    AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
 };
-use test_context::AsyncTestContext;
-use tracing::info;
+use test_context::{test_context, AsyncTestContext};
+use tracing::{debug, info};

 mod common;

-#[path = "common/tests.rs"]
-mod tests_azure;
-
-use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};

 const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

+/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
+/// See the client creation in [`create_azure_client`] for details on the required env vars.
+/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// where
+/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
+///
+/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledAzureWithTestBlobs)]
+#[tokio::test]
+async fn azure_pagination_should_work(
+    ctx: &mut MaybeEnabledAzureWithTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `azure_pagination_should_work` for more information.
+///
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
+#[tokio::test]
+async fn azure_list_files_works(
+    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
 struct EnabledAzure {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
@@ -41,13 +281,13 @@ impl EnabledAzure {
    }
 }

-enum MaybeEnabledStorage {
+enum MaybeEnabledAzure {
    Enabled(EnabledAzure),
    Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorage {
+impl AsyncTestContext for MaybeEnabledAzure {
    async fn setup() -> Self {
        ensure_logging_ready();

@@ -63,7 +303,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
    }
 }

-enum MaybeEnabledStorageWithTestBlobs {
+enum MaybeEnabledAzureWithTestBlobs {
    Enabled(AzureWithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
@@ -76,7 +316,7 @@ struct AzureWithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
+impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -127,7 +367,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledStorageWithSimpleTestBlobs {
+enum MaybeEnabledAzureWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
@@ -138,7 +378,7 @@ struct AzureWithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,80 +1,173 @@
+use std::collections::HashSet;
 use std::env;
-use std::fmt::{Debug, Display};
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::{Duration, UNIX_EPOCH};
-use std::{collections::HashSet, time::SystemTime};
+use std::time::UNIX_EPOCH;

-use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
 use camino::Utf8Path;
-use futures_util::Future;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
-use test_context::test_context;
-use test_context::AsyncTestContext;
-use tokio_util::sync::CancellationToken;
-use tracing::info;
+use test_context::{test_context, AsyncTestContext};
+use tracing::{debug, info};

 mod common;

-#[path = "common/tests.rs"]
-mod tests_s3;
-
-use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
-use utils::backoff;
+use common::{
+    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
+    upload_stream, wrap_stream,
+};

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

-#[test_context(MaybeEnabledStorage)]
+/// Tests that S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
+/// See the client creation in [`create_s3_client`] for details on the required env vars.
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// where
+/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since current default AWS S3 pagination limit is 1000.
+/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+///
+/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
-async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
-        MaybeEnabledStorage::Enabled(ctx) => ctx,
-        MaybeEnabledStorage::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
    };
-    // Our test depends on discrepancies in the clock between S3 and the environment the tests
-    // run in. Therefore, wait a little bit before and after. The alternative would be
-    // to take the time from S3 response headers.
-    const WAIT_TIME: Duration = Duration::from_millis(3_000);

-    async fn retry<T, O, F, E>(op: O) -> Result<T, E>
-    where
-        E: Display + Debug + 'static,
-        O: FnMut() -> F,
-        F: Future<Output = Result<T, E>>,
-    {
-        let warn_threshold = 3;
-        let max_retries = 10;
-        backoff::retry(
-            op,
-            |_e| false,
-            warn_threshold,
-            max_retries,
-            "test retry",
-            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
-        )
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
        .await
-    }
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );

-    async fn time_point() -> SystemTime {
-        tokio::time::sleep(WAIT_TIME).await;
-        let ret = SystemTime::now();
-        tokio::time::sleep(WAIT_TIME).await;
-        ret
-    }
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );

-    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None))
-            .await
-            .context("list root files failure")?
-            .into_iter()
-            .collect::<HashSet<_>>())
-    }
+    Ok(())
+}
+
+/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `s3_pagination_should_work` for more information.
+///
+/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
+#[tokio::test]
+async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };

    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;
@@ -85,95 +178,83 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    retry(|| {
-        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-        ctx.client.upload(data, len, &path1, None)
-    })
-    .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    let t0_files = list_files(&ctx.client).await?;
-    let t0 = time_point().await;
-    println!("at t0: {t0_files:?}");
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    let old_data = "remote blob data2";
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

-    retry(|| {
-        let (data, len) = upload_stream(old_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
-    })
-    .await?;
+    ctx.client.delete_objects(&[path1, path2]).await?;

-    let t1_files = list_files(&ctx.client).await?;
-    let t1 = time_point().await;
-    println!("at t1: {t1_files:?}");
+    let prefixes = ctx.client.list_prefixes(None).await?;

-    // A little check to ensure that our clock is not too far off from the S3 clock
-    {
-        let dl = retry(|| ctx.client.download(&path2)).await?;
-        let last_modified = dl.last_modified.unwrap();
-        let half_wt = WAIT_TIME.mul_f32(0.5);
-        let t0_hwt = t0 + half_wt;
-        let t1_hwt = t1 - half_wt;
-        if !(t0_hwt..=t1_hwt).contains(&last_modified) {
-            panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
-                This likely means a large lock discrepancy between S3 and the local clock.");
-        }
-    }
+    assert_eq!(prefixes.len(), 1);

-    retry(|| {
-        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-        ctx.client.upload(data, len, &path3, None)
-    })
-    .await?;
+    ctx.client.delete_objects(&[path3]).await?;

-    let new_data = "new remote blob data2";
+    Ok(())
+}

-    retry(|| {
-        let (data, len) = upload_stream(new_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
-    })
-    .await?;
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let MaybeEnabledS3::Enabled(ctx) = ctx else {
+        return Ok(());
+    };

-    retry(|| ctx.client.delete(&path1)).await?;
-    let t2_files = list_files(&ctx.client).await?;
-    let t2 = time_point().await;
-    println!("at t2: {t2_files:?}");
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;

-    // No changes after recovery to t2 (no-op)
-    let t_final = time_point().await;
-    ctx.client
-        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
        .await?;
-    let t2_files_recovered = list_files(&ctx.client).await?;
-    println!("after recovery to t2: {t2_files_recovered:?}");
-    assert_eq!(t2_files, t2_files_recovered);
-    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
-    assert_eq!(path2_recovered_t2, new_data.as_bytes());
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);

-    // after recovery to t1: path1 is back, path2 has the old content
-    let t_final = time_point().await;
-    ctx.client
-        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
        .await?;
-    let t1_files_recovered = list_files(&ctx.client).await?;
-    println!("after recovery to t1: {t1_files_recovered:?}");
-    assert_eq!(t1_files, t1_files_recovered);
-    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
-    assert_eq!(path2_recovered_t1, old_data.as_bytes());
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);

-    // after recovery to t0: everything is gone except for path1
-    let t_final = time_point().await;
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
-        .time_travel_recover(None, t0, t_final, CancellationToken::new())
-        .await?;
-    let t0_files_recovered = list_files(&ctx.client).await?;
-    println!("after recovery to t0: {t0_files_recovered:?}");
-    assert_eq!(t0_files, t0_files_recovered);
-
-    // cleanup
-
-    let paths = &[path1, path2, path3];
-    retry(|| ctx.client.delete_objects(paths)).await?;
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;

    Ok(())
 }
@@ -196,13 +277,13 @@ impl EnabledS3 {
    }
 }

-enum MaybeEnabledStorage {
+enum MaybeEnabledS3 {
    Enabled(EnabledS3),
    Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorage {
+impl AsyncTestContext for MaybeEnabledS3 {
    async fn setup() -> Self {
        ensure_logging_ready();

@@ -218,7 +299,7 @@ impl AsyncTestContext for MaybeEnabledStorage {
    }
 }

-enum MaybeEnabledStorageWithTestBlobs {
+enum MaybeEnabledS3WithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
@@ -231,7 +312,7 @@ struct S3WithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
+impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -282,7 +363,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledStorageWithSimpleTestBlobs {
+enum MaybeEnabledS3WithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
@@ -293,7 +374,7 @@ struct S3WithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
    #[serde(default)]
    pub http_connstr: Option<String>,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineCopyRequest {
-    pub target_timeline_id: TimelineId,
-    pub until_lsn: Lsn,
-}
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -15,10 +15,6 @@ use tracing::*;
 /// specified time (in milliseconds). The main difference is that we use async
 /// tokio sleep function. Another difference is that we print lines to the log,
 /// which can be useful in tests to check that the failpoint was hit.
-///
-/// Optionally pass a cancellation token, and this failpoint will drop out of
-/// its sleep when the cancellation token fires.  This is useful for testing
-/// cases where we would like to block something, but test its clean shutdown behavior.
 #[macro_export]
 macro_rules! __failpoint_sleep_millis_async {
    ($name:literal) => {{
@@ -34,24 +30,6 @@ macro_rules! __failpoint_sleep_millis_async {
            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
        }
    }};
-    ($name:literal, $cancel:expr) => {{
-        // If the failpoint is used with a "return" action, set should_sleep to the
-        // returned value (as string). Otherwise it's set to None.
-        let should_sleep = (|| {
-            ::fail::fail_point!($name, |x| x);
-            ::std::option::Option::None
-        })();
-
-        // Sleep if the action was a returned value
-        if let ::std::option::Option::Some(duration_str) = should_sleep {
-            $crate::failpoint_support::failpoint_sleep_cancellable_helper(
-                $name,
-                duration_str,
-                $cancel,
-            )
-            .await
-        }
-    }};
 }
 pub use __failpoint_sleep_millis_async as sleep_millis_async;

@@ -67,22 +45,6 @@ pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
    tracing::info!("failpoint {:?}: sleep done", name);
 }

-// Helper function used by the macro. (A function has nicer scoping so we
-// don't need to decorate everything with "::")
-#[doc(hidden)]
-pub async fn failpoint_sleep_cancellable_helper(
-    name: &'static str,
-    duration_str: String,
-    cancel: &CancellationToken,
-) {
-    let millis = duration_str.parse::<u64>().unwrap();
-    let d = std::time::Duration::from_millis(millis);
-
-    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-    tokio::time::timeout(d, cancel.cancelled()).await.ok();
-    tracing::info!("failpoint {:?}: sleep done", name);
-}
-
 pub fn init() -> fail::FailScenario<'static> {
    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
    // We want non-default behavior for `exit`, though, so, we handle it separately.
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -131,9 +131,7 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
-        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
-        _ => info!("Error processing HTTP request: {api_error:#}"),
+        _ => error!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -85,8 +85,6 @@ pub mod sync;

 pub mod failpoint_support;

-pub mod yielding_loop;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
 pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
    let bits = fcntl(fd, F_GETFL)?;

-    // If F_GETFL returns some unknown bits, they should be valid
+    // Safety: If F_GETFL returns some unknown bits, they should be valid
    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
    // would effectively clear them, which is not what we want.
-    let mut flags = OFlag::from_bits_retain(bits);
+    let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
    flags |= OFlag::O_NONBLOCK;

    fcntl(fd, F_SETFL(flags))?;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,12 +15,6 @@ pub struct Gate {
    name: String,
 }

-impl std::fmt::Debug for Gate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Gate<{}>", self.name)
-    }
-}
-
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
--- a/libs/utils/src/tcp_listener.rs
+++ b/libs/utils/src/tcp_listener.rs
@@ -1,6 +1,7 @@
 use std::{
    io,
    net::{TcpListener, ToSocketAddrs},
+    os::unix::prelude::AsRawFd,
 };

 use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
@@ -9,7 +10,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
 pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
    let listener = TcpListener::bind(addr)?;

-    setsockopt(&listener, ReuseAddr, &true)?;
+    setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;

    Ok(listener)
 }
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -1,35 +0,0 @@
-use tokio_util::sync::CancellationToken;
-
-#[derive(thiserror::Error, Debug)]
-pub enum YieldingLoopError {
-    #[error("Cancelled")]
-    Cancelled,
-}
-
-/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
-/// yields to avoid blocking the executor, and after resuming checks the provided
-/// cancellation token to drop out promptly on shutdown.
-#[inline(always)]
-pub async fn yielding_loop<I, T, F>(
-    interval: usize,
-    cancel: &CancellationToken,
-    iter: I,
-    mut visitor: F,
-) -> Result<(), YieldingLoopError>
-where
-    I: Iterator<Item = T>,
-    F: FnMut(T),
-{
-    for (i, item) in iter.enumerate() {
-        visitor(item);
-
-        if i + 1 % interval == 0 {
-            tokio::task::yield_now().await;
-            if cancel.is_cancelled() {
-                return Err(YieldingLoopError::Cancelled);
-            }
-        }
-    }
-
-    Ok(())
-}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -446,11 +446,12 @@ impl Runner {
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            // *Ideally* we'd like to log here that we're ignoring the fact the
-                            // memory stats are too high, but in practice this can result in
-                            // spamming the logs with repetitive messages about ignoring the signal
-                            //
-                            // See https://github.com/neondatabase/neon/issues/5865 for more.
+                            info!(
+                                elapsed_millis = elapsed.as_millis(),
+                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                                threshold = bytes_to_mebibytes(cgroup.threshold),
+                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
+                            );
                            continue;
                        }
                    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -61,7 +61,6 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,5 +1,5 @@
 use pageserver_api::{models::*, shard::TenantShardId};
-use reqwest::{IntoUrl, Method, StatusCode};
+use reqwest::{IntoUrl, Method};
 use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
@@ -22,18 +22,20 @@ pub enum Error {
    #[error("receive error body: {0}")]
    ReceiveErrorBody(String),

-    #[error("pageserver API: {1}")]
-    ApiError(StatusCode, String),
+    #[error("pageserver API: {0}")]
+    ApiError(String),
 }

 pub type Result<T> = std::result::Result<T, Error>;

+#[async_trait::async_trait]
 pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+    async fn error_from_body(self) -> Result<Self>;
 }

+#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
-    async fn error_from_body(self) -> Result<Self> {
+    async fn error_from_body(mut self) -> Result<Self> {
        let status = self.status();
        if !(status.is_client_error() || status.is_server_error()) {
            return Ok(self);
@@ -41,7 +43,7 @@ impl ResponseErrorMessageExt for reqwest::Response {

        let url = self.url().to_owned();
        Err(match self.json::<HttpErrorBody>().await {
-            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+            Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
            Err(_) => {
                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
            }
@@ -49,11 +51,6 @@ impl ResponseErrorMessageExt for reqwest::Response {
    }
 }

-pub enum ForceAwaitLogicalSize {
-    Yes,
-    No,
-}
-
 impl Client {
    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
        Self {
@@ -100,18 +97,11 @@ impl Client {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        force_await_logical_size: ForceAwaitLogicalSize,
    ) -> Result<pageserver_api::models::TimelineInfo> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );
-
-        let uri = match force_await_logical_size {
-            ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
-            ForceAwaitLogicalSize::No => uri,
-        };
-
        self.get(&uri)
            .await?
            .json()
@@ -177,15 +167,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/secondary/download",
-            self.mgmt_api_endpoint, tenant_id
-        );
-        self.request(Method::POST, &uri, ()).await?;
-        Ok(())
-    }
-
    pub async fn location_config(
        &self,
        tenant_shard_id: TenantShardId,
@@ -209,23 +190,14 @@ impl Client {
        Ok(())
    }

-    pub async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
-        let path = format!("{}/v1/location_config", self.mgmt_api_endpoint);
-        self.request(Method::GET, &path, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn timeline_create(
        &self,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        req: &TimelineCreateRequest,
    ) -> Result<TimelineInfo> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline",
-            self.mgmt_api_endpoint, tenant_shard_id
+            self.mgmt_api_endpoint, tenant_id
        );
        self.request(Method::POST, &uri, req)
            .await?
@@ -234,33 +206,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{}/reset",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.request(Method::POST, &uri, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn timeline_list(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> Result<Vec<TimelineInfo>> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline",
-            self.mgmt_api_endpoint, tenant_shard_id
-        );
-        self.get(&uri)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn tenant_synthetic_size(
        &self,
        tenant_shard_id: TenantShardId,
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -108,32 +108,9 @@ pub struct RelTagBlockNo {
 }

 impl PagestreamClient {
-    pub async fn shutdown(self) {
-        let Self {
-            copy_both,
-            cancel_on_client_drop: cancel_conn_task,
-            conn_task,
-        } = self;
-        // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
-        // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
-        // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
-        //
-        // If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
-        // the CopyFail mesage only makes it to the socket sometimes (i.e., it's a race).
-        //
-        // Further, the pageserver makes a lot of noise when it receives CopyFail.
-        // Computes don't send it in practice, they just hard-close the connection.
-        //
-        // So, let's behave like the computes and suppress the CopyFail as follows:
-        // kill the socket first, then drop copy_both.
-        //
-        // See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
-        //
-        // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
-        // => https://github.com/neondatabase/neon/issues/6390
-        let _ = cancel_conn_task.unwrap();
-        conn_task.await.unwrap();
-        drop(copy_both);
+    pub async fn shutdown(mut self) {
+        let _ = self.cancel_on_client_drop.take();
+        self.conn_task.await.unwrap();
    }

    pub async fn getpage(
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
-use pageserver::virtual_file::{self, VirtualFile};
+use pageserver::virtual_file::VirtualFile;

 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    pageserver::virtual_file::init(10);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
    let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+            pageserver::virtual_file::init(10);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
+    virtual_file::init(10);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-camino.workspace = true
 clap.workspace = true
 futures.workspace = true
 hdrhistogram.workspace = true
@@ -19,8 +18,8 @@ serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
 tokio.workspace = true
-tokio-util.workspace = true

+pageserver = { path = ".." }
 pageserver_client.workspace = true
 pageserver_api.workspace = true
 utils = { path = "../../libs/utils/" }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,5 +1,4 @@
 use anyhow::Context;
-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;

 use utils::id::TenantTimelineId;
@@ -93,12 +92,10 @@ async fn main_impl(
    for timeline in &timelines {
        js.spawn({
            let timeline = *timeline;
+            // FIXME: this triggers initial logical size calculation
+            // https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(
-                    timeline.tenant_id,
-                    timeline.timeline_id,
-                    ForceAwaitLogicalSize::No,
-                )
+                .timeline_info(timeline.tenant_id, timeline.timeline_id)
                .await
                .unwrap();
            async move {
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,11 +1,10 @@
 use anyhow::Context;
-use camino::Utf8PathBuf;
 use futures::future::join_all;
-use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver::pgdatadir_mapping::key_to_rel_block;
+use pageserver::repository;
+use pageserver_api::key::is_rel_block_key;
 use pageserver_api::models::PagestreamGetPageRequest;

-use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

@@ -14,7 +13,7 @@ use tokio::sync::Barrier;
 use tokio::task::JoinSet;
 use tracing::{info, instrument};

-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -45,12 +44,6 @@ pub(crate) struct Args {
    req_latest_probability: f64,
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
-    /// For large pageserver installations, enumerating the keyspace takes a lot of time.
-    /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
-    /// The cache is tagged and auto-invalided by the tenant/timeline ids only.
-    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
-    #[clap(long)]
-    keyspace_cache: Option<Utf8PathBuf>,
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -65,7 +58,7 @@ impl LiveStats {
    }
 }

-#[derive(Clone, serde::Serialize, serde::Deserialize)]
+#[derive(Clone)]
 struct KeyRange {
    timeline: TenantTimelineId,
    timeline_lsn: Lsn,
@@ -113,107 +106,59 @@ async fn main_impl(
    )
    .await?;

-    #[derive(serde::Deserialize)]
-    struct KeyspaceCacheDe {
-        tag: Vec<TenantTimelineId>,
-        data: Vec<KeyRange>,
-    }
-    #[derive(serde::Serialize)]
-    struct KeyspaceCacheSer<'a> {
-        tag: &'a [TenantTimelineId],
-        data: &'a [KeyRange],
-    }
-    let cache = args
-        .keyspace_cache
-        .as_ref()
-        .map(|keyspace_cache_file| {
-            let contents = match std::fs::read(keyspace_cache_file) {
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                    return anyhow::Ok(None);
-                }
-                x => x.context("read keyspace cache file")?,
-            };
-            let cache: KeyspaceCacheDe =
-                serde_json::from_slice(&contents).context("deserialize cache file")?;
-            let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
-                == HashSet::from_iter(timelines.iter().cloned());
-            info!("keyspace cache file matches tag: {tag_ok}");
-            anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
-        })
-        .transpose()?
-        .flatten();
-    let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
-        info!("using keyspace cache file");
-        cached
-    } else {
-        let mut js = JoinSet::new();
-        for timeline in &timelines {
-            js.spawn({
-                let mgmt_api_client = Arc::clone(&mgmt_api_client);
-                let timeline = *timeline;
-                async move {
-                    let partitioning = mgmt_api_client
-                        .keyspace(timeline.tenant_id, timeline.timeline_id)
-                        .await?;
-                    let lsn = partitioning.at_lsn;
-                    let start = Instant::now();
-                    let mut filtered = KeySpaceAccum::new();
-                    // let's hope this is inlined and vectorized...
-                    // TODO: turn this loop into a is_rel_block_range() function.
-                    for r in partitioning.keys.ranges.iter() {
-                        let mut i = r.start;
-                        while i != r.end {
-                            if is_rel_block_key(&i) {
-                                filtered.add_key(i);
-                            }
-                            i = i.next();
-                        }
-                    }
-                    let filtered = filtered.to_keyspace();
-                    let filter_duration = start.elapsed();
+    let mut js = JoinSet::new();
+    for timeline in &timelines {
+        js.spawn({
+            let mgmt_api_client = Arc::clone(&mgmt_api_client);
+            let timeline = *timeline;
+            async move {
+                let partitioning = mgmt_api_client
+                    .keyspace(timeline.tenant_id, timeline.timeline_id)
+                    .await?;
+                let lsn = partitioning.at_lsn;

-                    anyhow::Ok((
-                        filter_duration,
-                        filtered.ranges.into_iter().map(move |r| KeyRange {
-                            timeline,
-                            timeline_lsn: lsn,
-                            start: r.start.to_i128(),
-                            end: r.end.to_i128(),
-                        }),
-                    ))
-                }
-            });
-        }
-        let mut total_filter_duration = Duration::from_secs(0);
-        let mut all_ranges: Vec<KeyRange> = Vec::new();
-        while let Some(res) = js.join_next().await {
-            let (filter_duration, range) = res.unwrap().unwrap();
-            all_ranges.extend(range);
-            total_filter_duration += filter_duration;
-        }
-        info!("filter duration: {}", total_filter_duration.as_secs_f64());
-        if let Some(cachefile) = args.keyspace_cache.as_ref() {
-            let cache = KeyspaceCacheSer {
-                tag: &timelines,
-                data: &all_ranges,
-            };
-            let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
-            std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
-            info!("successfully wrote keyspace cache file");
-        }
-        all_ranges
-    };
+                let ranges = partitioning
+                    .keys
+                    .ranges
+                    .iter()
+                    .filter_map(|r| {
+                        let start = r.start;
+                        let end = r.end;
+                        // filter out non-relblock keys
+                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
+                            (true, true) => Some(KeyRange {
+                                timeline,
+                                timeline_lsn: lsn,
+                                start: start.to_i128(),
+                                end: end.to_i128(),
+                            }),
+                            (true, false) | (false, true) => {
+                                unimplemented!("split up range")
+                            }
+                            (false, false) => None,
+                        }
+                    })
+                    .collect::<Vec<_>>();
+
+                anyhow::Ok(ranges)
+            }
+        });
+    }
+    let mut all_ranges: Vec<KeyRange> = Vec::new();
+    while let Some(res) = js.join_next().await {
+        all_ranges.extend(res.unwrap().unwrap());
+    }

    let live_stats = Arc::new(LiveStats::default());

    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;
-    let num_main_impl = 1;

    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
    ));
+    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    tokio::spawn({
        let stats = Arc::clone(&live_stats);
@@ -233,143 +178,125 @@ async fn main_impl(
        }
    });

-    let cancel = CancellationToken::new();
-
-    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
+    let mut work_senders = HashMap::new();
    let mut tasks = Vec::new();
    for tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(*tl, sender);
+        work_senders.insert(tl, sender);
        tasks.push(tokio::spawn(client(
            args,
            *tl,
            Arc::clone(&start_work_barrier),
            receiver,
+            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
-            cancel.clone(),
        )));
    }

-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
-        let start_work_barrier = start_work_barrier.clone();
-        let cancel = cancel.clone();
-        match args.per_target_rate_limit {
-            None => Box::pin(async move {
+    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
+        None => Box::pin(async move {
+            let weights = rand::distributions::weighted::WeightedIndex::new(
+                all_ranges.iter().map(|v| v.len()),
+            )
+            .unwrap();
+
+            start_work_barrier.wait().await;
+
+            loop {
+                let (timeline, req) = {
+                    let mut rng = rand::thread_rng();
+                    let r = &all_ranges[weights.sample(&mut rng)];
+                    let key: i128 = rng.gen_range(r.start..r.end);
+                    let key = repository::Key::from_i128(key);
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    (
+                        r.timeline,
+                        PagestreamGetPageRequest {
+                            latest: rng.gen_bool(args.req_latest_probability),
+                            lsn: r.timeline_lsn,
+                            rel: rel_tag,
+                            blkno: block_no,
+                        },
+                    )
+                };
+                let sender = work_senders.get(&timeline).unwrap();
+                // TODO: what if this blocks?
+                sender.send(req).await.ok().unwrap();
+            }
+        }),
+        Some(rps_limit) => Box::pin(async move {
+            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
+
+            let make_timeline_task: &dyn Fn(
+                TenantTimelineId,
+            )
+                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                let sender = work_senders.get(&timeline).unwrap();
+                let ranges: Vec<KeyRange> = all_ranges
+                    .iter()
+                    .filter(|r| r.timeline == timeline)
+                    .cloned()
+                    .collect();
                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    all_ranges.iter().map(|v| v.len()),
+                    ranges.iter().map(|v| v.len()),
                )
                .unwrap();

-                start_work_barrier.wait().await;
-
-                while !cancel.is_cancelled() {
-                    let (timeline, req) = {
-                        let mut rng = rand::thread_rng();
-                        let r = &all_ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        let (rel_tag, block_no) =
-                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                        (
-                            r.timeline,
+                Box::pin(async move {
+                    let mut ticker = tokio::time::interval(period);
+                    ticker.set_missed_tick_behavior(
+                        /* TODO review this choice */
+                        tokio::time::MissedTickBehavior::Burst,
+                    );
+                    loop {
+                        ticker.tick().await;
+                        let req = {
+                            let mut rng = rand::thread_rng();
+                            let r = &ranges[weights.sample(&mut rng)];
+                            let key: i128 = rng.gen_range(r.start..r.end);
+                            let key = repository::Key::from_i128(key);
+                            let (rel_tag, block_no) = key_to_rel_block(key)
+                                .expect("we filter non-rel-block keys out above");
                            PagestreamGetPageRequest {
                                latest: rng.gen_bool(args.req_latest_probability),
                                lsn: r.timeline_lsn,
                                rel: rel_tag,
                                blkno: block_no,
-                            },
-                        )
-                    };
-                    let sender = work_senders.get(&timeline).unwrap();
-                    // TODO: what if this blocks?
-                    if sender.send(req).await.is_err() {
-                        assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
-                    }
-                }
-            }),
-            Some(rps_limit) => Box::pin(async move {
-                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_timeline_task: &dyn Fn(
-                    TenantTimelineId,
-                )
-                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
-                    let sender = work_senders.get(&timeline).unwrap();
-                    let ranges: Vec<KeyRange> = all_ranges
-                        .iter()
-                        .filter(|r| r.timeline == timeline)
-                        .cloned()
-                        .collect();
-                    let weights = rand::distributions::weighted::WeightedIndex::new(
-                        ranges.iter().map(|v| v.len()),
-                    )
-                    .unwrap();
-
-                    let cancel = cancel.clone();
-                    Box::pin(async move {
-                        let mut ticker = tokio::time::interval(period);
-                        ticker.set_missed_tick_behavior(
-                            /* TODO review this choice */
-                            tokio::time::MissedTickBehavior::Burst,
-                        );
-                        while !cancel.is_cancelled() {
-                            ticker.tick().await;
-                            let req = {
-                                let mut rng = rand::thread_rng();
-                                let r = &ranges[weights.sample(&mut rng)];
-                                let key: i128 = rng.gen_range(r.start..r.end);
-                                let key = Key::from_i128(key);
-                                assert!(is_rel_block_key(&key));
-                                let (rel_tag, block_no) = key_to_rel_block(key)
-                                    .expect("we filter non-rel-block keys out above");
-                                PagestreamGetPageRequest {
-                                    latest: rng.gen_bool(args.req_latest_probability),
-                                    lsn: r.timeline_lsn,
-                                    rel: rel_tag,
-                                    blkno: block_no,
-                                }
-                            };
-                            if sender.send(req).await.is_err() {
-                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
                            }
-                        }
-                    })
-                };
+                        };
+                        sender.send(req).await.ok().unwrap();
+                    }
+                })
+            };

-                let tasks: Vec<_> = work_senders
-                    .keys()
-                    .map(|tl| make_timeline_task(*tl))
-                    .collect();
+            let tasks: Vec<_> = work_senders
+                .keys()
+                .map(|tl| make_timeline_task(**tl))
+                .collect();

-                start_work_barrier.wait().await;
+            start_work_barrier.wait().await;

-                join_all(tasks).await;
-            }),
-        }
+            join_all(tasks).await;
+        }),
    };

-    let work_sender_task = tokio::spawn(work_sender);
-
-    info!("waiting for everything to become ready");
-    start_work_barrier.wait().await;
-    info!("work started");
    if let Some(runtime) = args.runtime {
-        tokio::time::sleep(runtime.into()).await;
-        info!("runtime over, signalling cancellation");
-        cancel.cancel();
-        work_sender_task.await.unwrap();
-        info!("work sender exited");
+        match tokio::time::timeout(runtime.into(), work_sender).await {
+            Ok(()) => unreachable!("work sender never terminates"),
+            Err(_timeout) => {
+                // this implicitly drops the work_senders, making all the clients exit
+            }
+        }
    } else {
-        work_sender_task.await.unwrap();
+        work_sender.await;
        unreachable!("work sender never terminates");
    }

-    info!("joining clients");
    for t in tasks {
        t.await.unwrap();
    }

-    info!("all clients stopped");
-
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
@@ -393,9 +320,11 @@ async fn client(
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
+    all_work_done_barrier: Arc<Barrier>,
    live_stats: Arc<LiveStats>,
-    cancel: CancellationToken,
 ) {
+    start_work_barrier.wait().await;
+
    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
        .await
        .unwrap();
@@ -404,27 +333,19 @@ async fn client(
        .await
        .unwrap();

-    let do_requests = async {
-        start_work_barrier.wait().await;
-        while let Some(req) = work.recv().await {
-            let start = Instant::now();
-            client
-                .getpage(req)
-                .await
-                .with_context(|| format!("getpage for {timeline}"))
-                .unwrap();
-            let elapsed = start.elapsed();
-            live_stats.inc();
-            STATS.with(|stats| {
-                stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-            });
-        }
-    };
-    tokio::select! {
-        res = do_requests => { res },
-        _ = cancel.cancelled() => {
-            // fallthrough to shutdown
-        }
+    while let Some(req) = work.recv().await {
+        let start = Instant::now();
+        client
+            .getpage(req)
+            .await
+            .with_context(|| format!("getpage for {timeline}"))
+            .unwrap();
+        let elapsed = start.elapsed();
+        live_stats.inc();
+        STATS.with(|stats| {
+            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
+        });
    }
-    client.shutdown().await;
+
+    all_work_done_barrier.wait().await;
 }
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -4,8 +4,6 @@ use humantime::Duration;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;

-use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
-
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
@@ -58,15 +56,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
+            // TODO: add an API that explicitly triggers the initial logical size calculation,
+            // instead of relying on it being a side effect of fetching timeline details.
+            // => https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(tl.tenant_id, tl.timeline_id)
                .await
                .unwrap();

-            // Polling should not be strictly required here since we await
-            // for the initial logical size, however it's possible for the request
-            // to land before the timeline is initialised. This results in an approximate
-            // logical size.
            if let Some(period) = args.poll_for_completion {
                let mut ticker = tokio::time::interval(period.into());
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
@@ -74,7 +71,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                        .timeline_info(tl.tenant_id, tl.timeline_id)
                        .await
                        .unwrap();
                }
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -35,7 +35,6 @@ fn main() {
        logging::Output::Stderr,
    )
    .unwrap();
-    logging::replace_panic_hook_with_tracing_panic_hook().forget();

    let args = Args::parse();
    match args {
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -11,9 +11,8 @@
 //! from data stored in object storage.
 //!
 use anyhow::{anyhow, bail, ensure, Context};
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::{BufMut, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -134,87 +133,6 @@ where
    ctx: &'a RequestContext,
 }

-/// A sink that accepts SLRU blocks ordered by key and forwards
-/// full segments to the archive.
-struct SlruSegmentsBuilder<'a, 'b, W>
-where
-    W: AsyncWrite + Send + Sync + Unpin,
-{
-    ar: &'a mut Builder<&'b mut W>,
-    buf: Vec<u8>,
-    current_segment: Option<(SlruKind, u32)>,
-}
-
-impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
-where
-    W: AsyncWrite + Send + Sync + Unpin,
-{
-    fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
-        Self {
-            ar,
-            buf: Vec::new(),
-            current_segment: None,
-        }
-    }
-
-    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
-        let (kind, segno, _) = key_to_slru_block(*key)?;
-
-        match kind {
-            SlruKind::Clog => {
-                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
-            }
-            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                ensure!(block.len() == BLCKSZ as usize);
-            }
-        }
-
-        let segment = (kind, segno);
-        match self.current_segment {
-            None => {
-                self.current_segment = Some(segment);
-                self.buf
-                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
-            }
-            Some(current_seg) if current_seg == segment => {
-                self.buf
-                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
-            }
-            Some(_) => {
-                self.flush().await?;
-
-                self.current_segment = Some(segment);
-                self.buf
-                    .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref());
-            }
-        }
-
-        Ok(())
-    }
-
-    async fn flush(&mut self) -> anyhow::Result<()> {
-        let nblocks = self.buf.len() / BLCKSZ as usize;
-        let (kind, segno) = self.current_segment.take().unwrap();
-        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
-        let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar.append(&header, self.buf.as_slice()).await?;
-
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
-
-        self.buf.clear();
-
-        Ok(())
-    }
-
-    async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
-
-        self.flush().await
-    }
-}
-
 impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
@@ -250,27 +168,20 @@ where
        }

        // Gather non-relational files from object storage pages.
-        let slru_partitions = self
-            .timeline
-            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-            .await?
-            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
-
-        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
-
-        for part in slru_partitions.parts {
-            let blocks = self
+        for kind in [
+            SlruKind::Clog,
+            SlruKind::MultiXactOffsets,
+            SlruKind::MultiXactMembers,
+        ] {
+            for segno in self
                .timeline
-                .get_vectored(&part.ranges, self.lsn, self.ctx)
-                .await?;
-
-            for (key, block) in blocks {
-                slru_builder.add_block(&key, block?).await?;
+                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
+                .await?
+            {
+                self.add_slru_segment(kind, segno).await?;
            }
        }

-        slru_builder.finish().await?;
-
        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
@@ -394,6 +305,39 @@ where
        Ok(())
    }

+    // Build the file for one SLRU segment (`segno` of kind `slru`) from the timeline's pages
+    // at `self.lsn` and append it to the archive as "<slru.to_str()>/<segno as 4+ hex digits>".
+    // Returns an error if a lookup fails or a page image has an unexpected length.
+    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
+        let nblocks = self
+            .timeline
+            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
+            .await?;
+
+        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
+        for blknum in 0..nblocks {
+            let img = self
+                .timeline
+                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
+                .await?;
+
+            if slru == SlruKind::Clog { // Clog images may be BLCKSZ or BLCKSZ + 8 bytes long
+                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
+            } else {
+                ensure!(img.len() == BLCKSZ as usize);
+            }
+
+            slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); // drop bytes past BLCKSZ
+        }
+
+        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
+        let header = new_tar_header(&segname, slru_buf.len() as u64)?;
+        self.ar.append(&header, slru_buf.as_slice()).await?;
+
+        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        Ok(())
+    }
+
    //
    // Include database/tablespace directories.
    //
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
+    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
@@ -527,7 +527,6 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
-            tenant_manager.clone(),
            background_jobs_barrier.clone(),
        )?;
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -36,16 +36,13 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
-use crate::virtual_file;
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
-    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;

-use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
-
 pub mod defaults {
    use crate::tenant::config::defaults::*;
    use const_format::formatcp;
@@ -78,12 +75,9 @@ pub mod defaults {
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

-    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
-
    ///
    /// Default built-in configuration file.
    ///
@@ -119,8 +113,6 @@ pub mod defaults {

 #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}

-#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -138,7 +130,6 @@ pub mod defaults {
 #gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
-#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

 [remote_storage]

@@ -248,14 +239,8 @@ pub struct PageServerConf {
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,

-    /// How many remote storage downloads may be done for secondary tenants concurrently.  Implicitly
-    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
-    pub secondary_download_concurrency: usize,
-
    /// Maximum number of WAL records to be ingested and committed at the same time
    pub ingest_batch_size: u64,
-
-    pub virtual_file_io_engine: virtual_file::IoEngineKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -337,11 +322,8 @@ struct PageServerConfigBuilder {
    control_plane_emergency_mode: BuilderValue<bool>,

    heatmap_upload_concurrency: BuilderValue<usize>,
-    secondary_download_concurrency: BuilderValue<usize>,

    ingest_batch_size: BuilderValue<u64>,
-
-    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 }

 impl Default for PageServerConfigBuilder {
@@ -414,11 +396,8 @@ impl Default for PageServerConfigBuilder {
            control_plane_emergency_mode: Set(false),

            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),

            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
-
-            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
        }
    }
 }
@@ -567,18 +546,10 @@ impl PageServerConfigBuilder {
        self.heatmap_upload_concurrency = BuilderValue::Set(value)
    }

-    pub fn secondary_download_concurrency(&mut self, value: usize) {
-        self.secondary_download_concurrency = BuilderValue::Set(value)
-    }
-
    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
    }

-    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
-        self.virtual_file_io_engine = BuilderValue::Set(value);
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -680,15 +651,9 @@ impl PageServerConfigBuilder {
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            secondary_download_concurrency: self
-                .secondary_download_concurrency
-                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
            ingest_batch_size: self
                .ingest_batch_size
                .ok_or(anyhow!("missing ingest_batch_size"))?,
-            virtual_file_io_engine: self
-                .virtual_file_io_engine
-                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
        })
    }
 }
@@ -746,11 +711,6 @@ impl PageServerConf {
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

-    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(TENANT_HEATMAP_BASENAME)
-    }
-
    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TIMELINES_SEGMENT_NAME)
@@ -936,13 +896,7 @@ impl PageServerConf {
                "heatmap_upload_concurrency" => {
                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
-                "secondary_download_concurrency" => {
-                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
-                },
                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
-                "virtual_file_io_engine" => {
-                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1014,9 +968,7 @@ impl PageServerConf {
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
        }
    }
 }
@@ -1150,12 +1102,11 @@ mod tests {
    };

    use camino_tempfile::{tempdir, Utf8TempDir};
-    use pageserver_api::models::EvictionPolicy;
    use remote_storage::{RemoteStorageKind, S3Config};
    use utils::serde_percent::Percent;

    use super::*;
-    use crate::DEFAULT_PG_VERSION;
+    use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};

    const ALL_BASE_VALUES_TOML: &str = r#"
 # Initial configuration file created by 'pageserver --init'
@@ -1247,9 +1198,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1311,9 +1260,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: 100,
-                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker(
            }
        };

-        for (tenant_shard_id, tenant_state, _gen) in tenants {
+        for (tenant_shard_id, tenant_state) in tenants {
            if tenant_state != TenantState::Active {
                continue;
            }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics(
        }
    };

-    let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
+    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,6 +1,5 @@
 use std::collections::HashMap;

-use futures::Future;
 use pageserver_api::{
    control_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -29,14 +28,13 @@ pub enum RetryForeverError {
    ShuttingDown,
 }

+#[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    fn re_attach(
-        &self,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
-    fn validate(
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
+    async fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
-    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
 }

 impl ControlPlaneClient {
@@ -125,6 +123,7 @@ impl ControlPlaneClient {
    }
 }

+#[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -831,6 +831,7 @@ mod test {
        }
    }

+    #[async_trait::async_trait]
    impl ControlPlaneGenerationsApi for MockControlPlane {
        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -47,24 +47,21 @@ use std::{
 };

 use anyhow::Context;
-use pageserver_api::shard::TenantShardId;
+use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;
-use utils::{completion, id::TimelineId};

 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        self,
-        mgr::TenantManager,
-        remote_timeline_client::LayerFileMetadata,
-        secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer},
        Timeline,
    },
 };
@@ -128,7 +125,6 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
-    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -154,7 +150,8 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
+            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
+                .await;
            Ok(())
        },
    );
@@ -167,7 +164,7 @@ async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenant_manager: Arc<TenantManager>,
+    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -194,7 +191,7 @@ async fn disk_usage_eviction_task(
                state,
                task_config,
                storage,
-                &tenant_manager,
+                tenants_dir,
                &cancel,
            )
            .await;
@@ -229,17 +226,15 @@ async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenant_manager: &Arc<TenantManager>,
+    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let tenants_dir = tenant_manager.get_conf().tenants_path();
-    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
+    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
    let res = disk_usage_eviction_task_iteration_impl(
        state,
        storage,
        usage_pre,
-        tenant_manager,
        task_config.eviction_order,
        cancel,
    )
@@ -253,7 +248,7 @@ async fn disk_usage_eviction_task_iteration(
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
-                    let after = filesystem_level_usage::get(&tenants_dir, task_config)
+                    let after = filesystem_level_usage::get(tenants_dir, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

@@ -329,7 +324,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    _storage: &GenericRemoteStorage,
    usage_pre: U,
-    tenant_manager: &Arc<TenantManager>,
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -350,29 +344,29 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates =
-        match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
-            EvictionCandidates::Cancelled => {
-                return Ok(IterationOutcome::Cancelled);
-            }
-            EvictionCandidates::Finished(partitioned) => partitioned,
-        };
+    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
+        EvictionCandidates::Cancelled => {
+            return Ok(IterationOutcome::Cancelled);
+        }
+        EvictionCandidates::Finished(partitioned) => partitioned,
+    };

    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
        let nth = i + 1;
+        let desc = candidate.layer.layer_desc();
        let total_candidates = candidates.len();
-        let size = candidate.layer.get_file_size();
+        let size = desc.file_size;
        let rel = candidate.relative_last_activity;
        debug!(
            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
-            candidate.layer.get_tenant_shard_id(),
-            candidate.layer.get_timeline_id(),
-            candidate.layer.get_name(),
+            desc.tenant_shard_id,
+            desc.timeline_id,
+            candidate.layer,
        );
    }

@@ -386,56 +380,39 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
+    let mut warned = None;
+    let mut usage_planned = usage_pre;
+    let mut evicted_amount = 0;

-    let selection = select_victims(&candidates, usage_pre);
-
-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        if !usage_planned.has_pressure() {
+            debug!(
+                no_candidates_evicted = i,
+                "took enough candidates for pressure to be relieved"
+            );
+            break;
        }

-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
+        if partition == &MinResidentSizePartition::Below && warned.is_none() {
+            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+            warned = Some(usage_planned);
        }

-        selection
-    } else {
-        selection
+        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+        evicted_amount += 1;
+    }
+
+    let usage_planned = match warned {
+        Some(respecting_tenant_min_resident_size) => PlannedUsage {
+            respecting_tenant_min_resident_size,
+            fallback_to_global_lru: Some(usage_planned),
+        },
+        None => PlannedUsage {
+            respecting_tenant_min_resident_size: usage_planned,
+            fallback_to_global_lru: None,
+        },
    };
-
-    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
+    debug!(?usage_planned, "usage planned");

    // phase2: evict layers

@@ -486,30 +463,19 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                continue;
            };

-            match candidate.layer {
-                EvictionLayer::Attached(layer) => {
-                    let file_size = layer.layer_desc().file_size;
-                    js.spawn(async move {
-                        layer
-                            .evict_and_wait()
-                            .await
-                            .map(|()| file_size)
-                            .map_err(|e| (file_size, e))
-                    });
-                }
-                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size();
-                    let tenant_manager = tenant_manager.clone();
+            js.spawn(async move {
+                let rtc = candidate.timeline.remote_client.as_ref().expect(
+                    "holding the witness, all timelines must have a remote timeline client",
+                );
+                let file_size = candidate.layer.layer_desc().file_size;
+                candidate
+                    .layer
+                    .evict_and_wait(rtc)
+                    .await
+                    .map(|()| file_size)
+                    .map_err(|e| (file_size, e))
+            });

-                    js.spawn(async move {
-                        layer
-                            .secondary_tenant
-                            .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
-                            .await;
-                        Ok(file_size)
-                    });
-                }
-            }
            tokio::task::yield_now().await;
        }

@@ -536,100 +502,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 }

 #[derive(Clone)]
-pub(crate) struct EvictionSecondaryLayer {
-    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
-    pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerFileName,
-    pub(crate) metadata: LayerFileMetadata,
-}
-
-/// Full [`Layer`] objects are specific to tenants in attached mode.  This type is a layer
-/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
-#[derive(Clone)]
-pub(crate) enum EvictionLayer {
-    Attached(Layer),
-    #[allow(dead_code)]
-    Secondary(EvictionSecondaryLayer),
-}
-
-impl From<Layer> for EvictionLayer {
-    fn from(value: Layer) -> Self {
-        Self::Attached(value)
-    }
-}
-
-impl EvictionLayer {
-    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
-        match self {
-            Self::Attached(l) => &l.layer_desc().tenant_shard_id,
-            Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
-        }
-    }
-
-    pub(crate) fn get_timeline_id(&self) -> &TimelineId {
-        match self {
-            Self::Attached(l) => &l.layer_desc().timeline_id,
-            Self::Secondary(sl) => &sl.timeline_id,
-        }
-    }
-
-    pub(crate) fn get_name(&self) -> LayerFileName {
-        match self {
-            Self::Attached(l) => l.layer_desc().filename(),
-            Self::Secondary(sl) => sl.name.clone(),
-        }
-    }
-
-    pub(crate) fn get_file_size(&self) -> u64 {
-        match self {
-            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub(crate) struct EvictionCandidate {
-    pub(crate) layer: EvictionLayer,
-    pub(crate) last_activity_ts: SystemTime,
-    pub(crate) relative_last_activity: finite_f32::FiniteF32,
-}
-
-impl std::fmt::Display for EvictionLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        match self {
-            Self::Attached(l) => l.fmt(f),
-            Self::Secondary(sl) => {
-                write!(f, "{}/{}", sl.timeline_id, sl.name)
-            }
-        }
-    }
-}
-
-pub(crate) struct DiskUsageEvictionInfo {
-    /// Timeline's largest layer (remote or resident)
-    pub max_layer_size: Option<u64>,
-    /// Timeline's resident layers
-    pub resident_layers: Vec<EvictionCandidate>,
-}
-
-impl std::fmt::Debug for EvictionCandidate {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
-        // having to allocate a string to this is bad, but it will rarely be formatted
-        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
-        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
-        struct DisplayIsDebug<'a, T>(&'a T);
-        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
-            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                write!(f, "{}", self.0)
-            }
-        }
-        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
-            .field("layer", &DisplayIsDebug(&self.layer))
-            .field("last_activity", &ts)
-            .finish()
-    }
+struct EvictionCandidate {
+    timeline: Arc<Timeline>,
+    layer: Layer,
+    last_activity_ts: SystemTime,
+    relative_last_activity: finite_f32::FiniteF32,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -746,7 +623,6 @@ enum EvictionCandidates {
 /// - tenant B 1 layer
 /// - tenant C 8 layers
 async fn collect_eviction_candidates(
-    tenant_manager: &Arc<TenantManager>,
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
@@ -755,16 +631,13 @@ async fn collect_eviction_candidates(
        .await
        .context("get list of tenants")?;

-    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
-    // and the resulting data structure can be huge.
-    // (https://github.com/neondatabase/neon/issues/6224)
    let mut candidates = Vec::new();

-    for (tenant_id, _state, _gen) in tenants {
+    for (tenant_id, _state) in &tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -792,7 +665,11 @@ async fn collect_eviction_candidates(
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
-            tenant_candidates.extend(info.resident_layers.into_iter());
+            tenant_candidates.extend(
+                info.resident_layers
+                    .into_iter()
+                    .map(|layer_infos| (tl.clone(), layer_infos)),
+            );
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

            if cancel.is_cancelled() {
@@ -813,16 +690,14 @@ async fn collect_eviction_candidates(
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                tenant_id=%tenant.tenant_id(),
                overridden_size=s,
                "using overridden min resident size for tenant"
            );
            s
        } else {
            debug!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                tenant_id=%tenant.tenant_id(),
                max_layer_size,
                "using max layer size as min_resident_size for tenant",
            );
@@ -832,7 +707,7 @@ async fn collect_eviction_candidates(
        // Sort layers most-recently-used first, then partition by
        // cumsum above/below min_resident_size.
        tenant_candidates
-            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;

        // keeping the -1 or not decides if every tenant should lose their least recently accessed
@@ -866,10 +741,12 @@ async fn collect_eviction_candidates(
            .unwrap_or(1);
        let divider = total as f32;

-        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
+        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
+            let file_size = layer_info.file_size();
+
            // as we iterate this reverse sorted list, the most recently accessed layer will always
            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
+            let relative_last_activity = if matches!(
                eviction_order,
                EvictionOrder::RelativeAccessed { .. }
            ) {
@@ -884,123 +761,41 @@ async fn collect_eviction_candidates(
                finite_f32::FiniteF32::ZERO
            };

+            let candidate = EvictionCandidate {
+                timeline,
+                last_activity_ts: layer_info.last_activity_ts,
+                layer: layer_info.layer,
+                relative_last_activity,
+            };
            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
            } else {
                MinResidentSizePartition::Below
            };
-            cumsum += i128::from(candidate.layer.get_file_size());
            candidates.push((partition, candidate));
+            cumsum += i128::from(file_size);
        }
    }

-    // Note: the same tenant ID might be hit twice, if it transitions from attached to
-    // secondary while we run.  That is okay: when we eventually try and run the eviction,
-    // the `Gate` on the object will ensure that whichever one has already been shut down
-    // will not delete anything.
-
-    let mut secondary_tenants = Vec::new();
-    tenant_manager.foreach_secondary_tenants(
-        |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
-            secondary_tenants.push(state.clone());
-        },
-    );
-
-    for secondary_tenant in secondary_tenants {
-        let mut layer_info = secondary_tenant.get_layers_for_eviction();
-
-        layer_info
-            .resident_layers
-            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
-
-        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
-            (
-                // Secondary locations' layers are always considered above the min resident size,
-                // i.e. secondary locations are permitted to be trimmed to zero layers if all
-                // the layers have sufficiently old access times.
-                MinResidentSizePartition::Above,
-                candidate,
-            )
-        }));
-    }
-
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");

-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to get compare evictions.
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+    match eviction_order {
+        EvictionOrder::AbsoluteAccessed => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.last_activity_ts)
+            });
+        }
+        EvictionOrder::RelativeAccessed { .. } => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            });
+        }
+    }

    Ok(EvictionCandidates::Finished(candidates))
 }

-/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
-/// relieve pressure.
-///
-/// Returns the amount of candidates selected, with the planned usage.
-fn select_victims<U: Usage>(
-    candidates: &[(MinResidentSizePartition, EvictionCandidate)],
-    usage_pre: U,
-) -> VictimSelection<U> {
-    let mut usage_when_switched = None;
-    let mut usage_planned = usage_pre;
-    let mut evicted_amount = 0;
-
-    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        if !usage_planned.has_pressure() {
-            break;
-        }
-
-        if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
-            usage_when_switched = Some((usage_planned, i));
-        }
-
-        usage_planned.add_available_bytes(candidate.layer.get_file_size());
-        evicted_amount += 1;
-    }
-
-    VictimSelection {
-        amount: evicted_amount,
-        usage_pre,
-        usage_when_switched,
-        usage_planned,
-    }
-}
-
-struct VictimSelection<U> {
-    amount: usize,
-    usage_pre: U,
-    usage_when_switched: Option<(U, usize)>,
-    usage_planned: U,
-}
-
-impl<U: Usage> VictimSelection<U> {
-    fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
-        debug!(
-            evicted_amount=%self.amount,
-            "took enough candidates for pressure to be relieved"
-        );
-
-        if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
-            warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
-        }
-
-        let planned = match self.usage_when_switched {
-            Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
-                respecting_tenant_min_resident_size,
-                fallback_to_global_lru: Some(self.usage_planned),
-            },
-            None => PlannedUsage {
-                respecting_tenant_min_resident_size: self.usage_planned,
-                fallback_to_global_lru: None,
-            },
-        };
-
-        (self.amount, planned)
-    }
-}
-
 struct TimelineKey(Arc<Timeline>);

 impl PartialEq for TimelineKey {
@@ -1026,7 +821,7 @@ impl std::ops::Deref for TimelineKey {
 }

 /// A totally ordered f32 subset we can use with sorting functions.
-pub(crate) mod finite_f32 {
+mod finite_f32 {

    /// A totally ordered f32 subset we can use with sorting functions.
    #[derive(Clone, Copy, PartialEq)]
@@ -1085,137 +880,6 @@ pub(crate) mod finite_f32 {
    }
 }

-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
-        }
-    }
-}
-
 mod filesystem_level_usage {
    use anyhow::Context;
    use camino::Utf8Path;
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -877,56 +877,6 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"

-  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Marks the initdb archive for preservation upon deletion of the timeline or tenant.
-        This is meant to be part of the disaster recovery process.
-      responses:
-        "202":
-          description: Tenant scheduled to load successfully
-        "404":
-          description: No tenant or timeline found for the specified ids
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -14,14 +14,14 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::LocationConfigListResponse;
-use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
-use pageserver_api::models::TenantState;
+use pageserver_api::models::TenantShardSplitRequest;
+use pageserver_api::models::TenantShardSplitResponse;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
 };
+use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -44,14 +44,12 @@ use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
-use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::SpawnMode;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
@@ -117,6 +115,14 @@ impl State {
            secondary_controller,
        })
    }
+
+    fn tenant_resources(&self) -> TenantSharedResources {
+        TenantSharedResources {
+            broker_client: self.broker_client.clone(),
+            remote_storage: self.remote_storage.clone(),
+            deletion_queue_client: self.deletion_queue_client.clone(),
+        }
+    }
 }

 #[inline(always)]
@@ -149,7 +155,6 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
-            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
@@ -172,7 +177,7 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
-            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
+            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -187,19 +192,6 @@ impl From<TenantSlotUpsertError> for ApiError {
        match e {
            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
            MapState(e) => e.into(),
-            ShuttingDown(_) => ApiError::ShuttingDown,
-        }
-    }
-}
-
-impl From<UpsertLocationError> for ApiError {
-    fn from(e: UpsertLocationError) -> ApiError {
-        use UpsertLocationError::*;
-        match e {
-            BadRequest(e) => ApiError::BadRequest(e),
-            Unavailable(_) => ApiError::ShuttingDown,
-            e @ InProgress => ApiError::Conflict(format!("{e}")),
-            Flush(e) | Other(e) => ApiError::InternalServerError(e),
        }
    }
 }
@@ -326,21 +318,11 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
-    force_await_initial_logical_size: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    if force_await_initial_logical_size {
-        timeline.clone().await_initial_logical_size().await
-    }
-
-    let mut info = build_timeline_info_common(
-        timeline,
-        ctx,
-        tenant::timeline::GetLogicalSizePriority::Background,
-    )
-    .await?;
+    let mut info = build_timeline_info_common(timeline, ctx).await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -357,7 +339,6 @@ async fn build_timeline_info(
 async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
-    logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let initdb_lsn = timeline.initdb_lsn;
@@ -380,7 +361,8 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -491,15 +473,11 @@ async fn timeline_create_handler(
        .await {
            Ok(new_timeline) => {
                // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Err(_) if tenant.cancel.is_cancelled() => {
-                // In case we get some ugly error type during shutdown, cast it into a clean 503.
-                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
-            }
            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                json_response(StatusCode::CONFLICT, ())
            }
@@ -531,8 +509,6 @@ async fn timeline_list_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -546,7 +522,6 @@ async fn timeline_list_handler(
            let timeline_info = build_timeline_info(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
-                force_await_initial_logical_size.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -566,43 +541,6 @@ async fn timeline_list_handler(
    json_response(StatusCode::OK, response_data)
 }

-async fn timeline_preserve_initdb_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    // Part of the process for disaster recovery from safekeeper-stored WAL:
-    // If we don't recover into a new timeline but want to keep the timeline ID,
-    // then the initdb archive is deleted. This endpoint copies it to a different
-    // location where timeline recreation cand find it.
-
-    async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
-
-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
-
-        timeline
-            .preserve_initdb_archive()
-            .await
-            .context("preserving initdb archive")
-            .map_err(ApiError::InternalServerError)?;
-
-        Ok::<_, ApiError>(())
-    }
-    .instrument(info_span!("timeline_preserve_initdb_archive",
-                tenant_id = %tenant_shard_id.tenant_id,
-                shard_id = %tenant_shard_id.shard_slug(),
-                %timeline_id))
-    .await?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -611,8 +549,6 @@ async fn timeline_detail_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
-    let force_await_initial_logical_size: Option<bool> =
-        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
@@ -628,7 +564,6 @@ async fn timeline_detail_handler(
        let timeline_info = build_timeline_info(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
-            force_await_initial_logical_size.unwrap_or(false),
            &ctx,
        )
        .await
@@ -747,39 +682,16 @@ async fn tenant_attach_handler(
        )));
    }

-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-    let shard_params = ShardParameters::default();
-    let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
-
-    let tenant = state
-        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            None,
-            SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    let Some(tenant) = tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
-
-    // We might have successfully constructed a Tenant, but it could still
-    // end up in a broken state:
-    if let TenantState::Broken {
-        reason,
-        backtrace: _,
-    } = tenant.current_state()
-    {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Tenant state is Broken: {reason}"
-        )));
-    }
+    mgr::attach_tenant(
+        state.conf,
+        tenant_id,
+        generation,
+        tenant_conf,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_attach", %tenant_id))
+    .await?;

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -920,12 +832,11 @@ async fn tenant_list_handler(
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
        .iter()
-        .map(|(id, state, gen)| TenantInfo {
+        .map(|(id, state)| TenantInfo {
            id: *id,
            state: state.clone(),
            current_physical_size: None,
            attachment_status: state.attachment_status(),
-            generation: (*gen).into(),
        })
        .collect::<Vec<TenantInfo>>();

@@ -955,7 +866,6 @@ async fn tenant_status(
                state: state.clone(),
                current_physical_size: Some(current_physical_size),
                attachment_status: state.attachment_status(),
-                generation: tenant.generation().into(),
            },
            timelines: tenant.list_timeline_ids(),
        })
@@ -1080,6 +990,25 @@ async fn tenant_size_handler(
    )
 }

+async fn tenant_shard_split_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let req: TenantShardSplitRequest = json_request(&mut request).await?;
+    // The JSON body supplies `new_shard_count`; the shard to act on is named in the URL path.
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let state = get_state(&request);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    // NOTE(review): `new_shard_count` is forwarded unvalidated; presumably `shard_split` checks it.
+    let new_shards = state
+        .tenant_manager
+        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    // Every `shard_split` failure maps to `InternalServerError` above; success replies 200 OK.
+    json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
+}
+
 async fn layer_map_info_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1240,35 +1169,37 @@ async fn tenant_create_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let location_conf =
-        LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);
+    let new_tenant = mgr::create_tenant(
+        state.conf,
+        tenant_conf,
+        target_tenant_id,
+        request_data.shard_parameters,
+        generation,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
+    .await?;

-    let new_tenant = state
-        .tenant_manager
-        .upsert_location(
-            target_tenant_id,
-            location_conf,
-            None,
-            SpawnMode::Create,
-            &ctx,
-        )
-        .await?;
-
-    let Some(new_tenant) = new_tenant else {
-        // This should never happen: indicates a bug in upsert_location
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "Upsert succeeded but didn't return tenant!"
-        )));
-    };
    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
-    new_tenant
+    if let res @ Err(_) = new_tenant
        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await?;
+        .await
+    {
+        // This shouldn't happen because we just created the tenant directory
+        // in tenant::mgr::create_tenant, and there aren't any remote timelines
+        // to load, so, nothing can really fail during load.
+        // Don't do cleanup because we don't know how we got here.
+        // The tenant will likely be in `Broken` state and subsequent
+        // calls will fail.
+        res.context("created tenant failed to become active")
+            .map_err(ApiError::InternalServerError)?;
+    }

    json_response(
        StatusCode::CREATED,
-        TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
+        TenantCreateResponse(new_tenant.tenant_id()),
    )
 }

@@ -1358,57 +1289,16 @@ async fn put_tenant_location_config_handler(

    state
        .tenant_manager
-        .upsert_location(
-            tenant_shard_id,
-            location_conf,
-            flush,
-            tenant::SpawnMode::Normal,
-            &ctx,
-        )
-        .await?;
-
-    if let Some(_flush_ms) = flush {
-        match state
-            .secondary_controller
-            .upload_tenant(tenant_shard_id)
-            .await
-        {
-            Ok(()) => {
-                tracing::info!("Uploaded heatmap during flush");
-            }
-            Err(e) => {
-                tracing::warn!("Failed to flush heatmap: {e}");
-            }
-        }
-    } else {
-        tracing::info!("No flush requested when configuring");
-    }
+        .upsert_location(tenant_shard_id, location_conf, flush, &ctx)
+        .await
+        // TODO: mapping to `BadRequest` assumes the caller asked for something unreasonable, but
+        // in principle the failure could be e.g. concurrent API calls to the same tenant, which
+        // should be reported as a 409 rather than a 400.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }

-async fn list_location_config_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let slots = state.tenant_manager.list();
-    let result = LocationConfigListResponse {
-        tenant_shards: slots
-            .into_iter()
-            .map(|(tenant_shard_id, slot)| {
-                let v = match slot {
-                    TenantSlot::Attached(t) => Some(t.get_location_conf()),
-                    TenantSlot::Secondary(s) => Some(s.get_location_conf()),
-                    TenantSlot::InProgress(_) => None,
-                };
-                (tenant_shard_id, v)
-            })
-            .collect(),
-    };
-    json_response(StatusCode::OK, result)
-}
-
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
    r: Request<Body>,
@@ -1710,13 +1600,12 @@ async fn disk_usage_eviction_run(
        )));
    };

-    let eviction_state = state.disk_usage_eviction_state.clone();
+    let state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &eviction_state,
+        &state,
        storage,
        usage,
-        &state.tenant_manager,
        config.eviction_order,
        &cancel,
    )
@@ -1744,21 +1633,6 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

-async fn secondary_download_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    state
-        .secondary_controller
-        .download_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1945,15 +1819,15 @@ pub fn make_router(
        .put("/v1/tenant/config", |r| {
            api_handler(r, update_tenant_config_handler)
        })
+        .put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
+            api_handler(r, tenant_shard_split_handler)
+        })
        .get("/v1/tenant/:tenant_shard_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
-        .get("/v1/location_config", |r| {
-            api_handler(r, list_location_config_handler)
-        })
        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
@@ -1975,10 +1849,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
-            |r| api_handler(r, timeline_preserve_initdb_handler),
-        )
        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
@@ -2034,9 +1904,6 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
-            api_handler(r, secondary_download_handler)
-        })
        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,4 +1,3 @@
-#![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]

 mod auth;
@@ -118,10 +117,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

-/// Per-tenant copy of their remote heatmap, downloaded into the local
-/// tenant path while in secondary mode.
-pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};

 /// Prometheus histogram buckets (in seconds) for operations in the critical
 /// path. In other words, operations that directly affect that latency of user
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum StorageTimeOperation {
+pub enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
    LayerFlush,

@@ -55,20 +55,20 @@ pub(crate) enum StorageTimeOperation {
    CreateTenant,
 }

-pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
        "pageserver_storage_operations_seconds_sum",
        "Total time spent on storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "shard_id", "timeline_id"],
+        &["operation", "tenant_id", "timeline_id"],
    )
    .expect("failed to define a metric")
 });

-pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_storage_operations_seconds_count",
        "Count of storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "shard_id", "timeline_id"],
+        &["operation", "tenant_id", "timeline_id"],
    )
    .expect("failed to define a metric")
 });
@@ -150,44 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) struct GetVectoredLatency {
-    map: EnumMap<TaskKind, Option<Histogram>>,
-}
-
-impl GetVectoredLatency {
-    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
-    // cardinality of the metric.
-    const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
-
-    pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
-        self.map[task_kind].as_ref()
-    }
-}
-
-pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
-        "pageserver_get_vectored_seconds",
-        "Time spent in get_vectored",
-        &["task_kind"],
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric");
-
-    GetVectoredLatency {
-        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
-
-            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
-                let task_kind = task_kind.into();
-                Some(inner.with_label_values(&[task_kind]))
-            } else {
-                None
-            }
-        })),
-    }
-});
-
-pub(crate) struct PageCacheMetricsForTaskKind {
+pub struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,

@@ -196,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub(crate) struct PageCacheMetrics {
+pub struct PageCacheMetrics {
    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -218,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
@@ -280,9 +243,10 @@ impl PageCacheMetrics {
    }
 }

-pub(crate) struct PageCacheSizeMetrics {
+pub struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

+    pub current_bytes_ephemeral: UIntGauge,
    pub current_bytes_immutable: UIntGauge,
    pub current_bytes_materialized_page: UIntGauge,
 }
@@ -296,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
-    Lazy::new(|| PageCacheSizeMetrics {
-        max_bytes: {
-            register_uint_gauge!(
-                "pageserver_page_cache_size_max_bytes",
-                "Maximum size of the page cache in bytes"
-            )
-            .expect("failed to define a metric")
-        },
-        current_bytes_immutable: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["immutable"])
-                .unwrap()
-        },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
-    });
+pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
+    max_bytes: {
+        register_uint_gauge!(
+            "pageserver_page_cache_size_max_bytes",
+            "Maximum size of the page cache in bytes"
+        )
+        .expect("failed to define a metric")
+    },
+    // The per-kind gauges below are children of PAGE_CACHE_SIZE_CURRENT_BYTES, one label value each.
+    current_bytes_ephemeral: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["ephemeral"])
+            .unwrap()
+    },
+    current_bytes_immutable: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["immutable"])
+            .unwrap()
+    },
+    current_bytes_materialized_page: {
+        PAGE_CACHE_SIZE_CURRENT_BYTES
+            .get_metric_with_label_values(&["materialized_page"])
+            .unwrap()
+    },
+});

 pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;
@@ -374,6 +343,15 @@ pub(crate) mod page_cache_eviction_metrics {
    }
 }

+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(), // the file's bucket set for critical-path operations (seconds)
+    )
+    .expect("failed to define a metric") // runs on first access; panics if registration fails
+});
+
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "page_cache_errors_total",
@@ -410,7 +388,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
        "Last record LSN grouped by timeline",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -419,7 +397,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -437,7 +415,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
        "pageserver_remote_physical_size",
        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
        // Corollary: If any files are missing from the index part, they won't be included here.
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -470,7 +448,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_current_logical_size",
        "Current logical size grouped by timeline",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define current logical size metric")
 });
@@ -619,7 +597,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_broken_tenants_count",
        "Set of broken tenants",
-        &["tenant_id", "shard_id"]
+        &["tenant_id"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });
@@ -639,7 +617,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_created_persistent_files_total",
        "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -648,7 +626,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_written_persistent_bytes_total",
        "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -667,7 +645,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions",
        "Number of layers evicted from the pageserver",
-        &["tenant_id", "shard_id", "timeline_id"]
+        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -762,13 +740,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub(crate) struct EvictionsWithLowResidenceDuration {
+pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
    threshold: Duration,
    counter: Option<IntCounter>,
 }

-pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
+pub struct EvictionsWithLowResidenceDurationBuilder {
    data_source: &'static str,
    threshold: Duration,
 }
@@ -964,12 +942,11 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
-        &["operation", "tenant_id", "shard_id", "timeline_id"]
+        &["operation", "tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });

-#[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
    use super::*;

@@ -989,20 +966,6 @@ pub(crate) mod virtual_file_descriptor_cache {
    // ```
 }

-#[cfg(not(test))]
-pub(crate) mod virtual_file_io_engine {
-    use super::*;
-
-    pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
-        register_uint_gauge_vec!(
-            "pageserver_virtual_file_io_engine_kind",
-            "The configured io engine for VirtualFile",
-            &["kind"],
-        )
-        .unwrap()
-    });
-}
-
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
    global: Histogram,
@@ -1046,7 +1009,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub(crate) struct SmgrQueryTimePerTimeline {
+pub struct SmgrQueryTimePerTimeline {
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1054,7 +1017,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
-        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
+        &["smgr_query_type", "tenant_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -1121,9 +1084,8 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 });

 impl SmgrQueryTimePerTimeline {
-    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
+    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let metrics = std::array::from_fn(|i| {
            let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1131,7 +1093,7 @@ impl SmgrQueryTimePerTimeline {
                .get_metric_with_label_values(&[op.into()])
                .unwrap();
            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
                .unwrap();
            GlobalAndPerTimelineHistogram {
                global,
@@ -1151,7 +1113,6 @@ impl SmgrQueryTimePerTimeline {

 #[cfg(test)]
 mod smgr_query_time_tests {
-    use pageserver_api::shard::TenantShardId;
    use strum::IntoEnumIterator;
    use utils::id::{TenantId, TimelineId};

@@ -1178,10 +1139,7 @@ mod smgr_query_time_tests {
        for op in &ops {
            let tenant_id = TenantId::generate();
            let timeline_id = TimelineId::generate();
-            let metrics = super::SmgrQueryTimePerTimeline::new(
-                &TenantShardId::unsharded(tenant_id),
-                &timeline_id,
-            );
+            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

            let get_counts = || {
                let global: u64 = ops
@@ -1223,8 +1181,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub(crate) struct BasebackupQueryTime(HistogramVec);
-pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub struct BasebackupQueryTime(HistogramVec);
+pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
@@ -1244,7 +1202,7 @@ impl DurationResultObserver for BasebackupQueryTime {
    }
 }

-pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
        "Number of live network connections",
@@ -1262,13 +1220,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
        "Number of ongoing calls to remote timeline client. \
         Used to populate pageserver_remote_timeline_client_calls_started. \
         This metric is not useful for sampling from Prometheus, but useful in tests.",
-        &[
-            "tenant_id",
-            "shard_id",
-            "timeline_id",
-            "file_kind",
-            "op_kind"
-        ],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
    )
    .expect("failed to define a metric")
 });
@@ -1289,23 +1241,22 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
    .expect("failed to define a metric")
 });

-static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
-    Lazy::new(|| {
-        register_int_counter_vec!(
+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
        "pageserver_remote_timeline_client_bytes_started",
        "Incremented by the number of bytes associated with a remote timeline client operation. \
         The increment happens when the operation is scheduled.",
-        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
    )
-        .expect("failed to define a metric")
-    });
+    .expect("failed to define a metric")
+});

 static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_remote_timeline_client_bytes_finished",
        "Incremented by the number of bytes associated with a remote timeline client operation. \
         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
-        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
    )
    .expect("failed to define a metric")
 });
@@ -1418,8 +1369,6 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
    pub(crate) upload_heatmap_duration: Histogram,
-    pub(crate) download_heatmap: IntCounter,
-    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
@@ -1437,16 +1386,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
-    download_heatmap: register_int_counter!(
-        "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
-    download_layer: register_int_counter!(
-        "pageserver_secondary_download_layer",
-        "Number of downloads of layers by secondary mode locations"
-    )
-    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1716,7 +1655,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub(crate) struct StorageTimeMetricsTimer {
+pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
 }
@@ -1741,7 +1680,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub(crate) struct StorageTimeMetrics {
+pub struct StorageTimeMetrics {
    /// Sum of f64 seconds, per operation, tenant_id and timeline_id
    timeline_sum: Counter,
    /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1751,19 +1690,14 @@ pub(crate) struct StorageTimeMetrics {
 }

 impl StorageTimeMetrics {
-    pub fn new(
-        operation: StorageTimeOperation,
-        tenant_id: &str,
-        shard_id: &str,
-        timeline_id: &str,
-    ) -> Self {
+    pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
        let operation: &'static str = operation.into();

        let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
            .unwrap();
        let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
            .unwrap();
        let global_histogram = STORAGE_TIME_GLOBAL
            .get_metric_with_label_values(&[operation])
@@ -1785,7 +1719,7 @@ impl StorageTimeMetrics {
 }

 #[derive(Debug)]
-pub(crate) struct TimelineMetrics {
+pub struct TimelineMetrics {
    tenant_id: String,
    shard_id: String,
    timeline_id: String,
@@ -1815,66 +1749,40 @@ impl TimelineMetrics {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
-        let flush_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LayerFlush,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let compact_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::Compact,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let create_images_time_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::CreateImages,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let logical_size_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LogicalSize,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
+        let flush_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
+        let compact_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
+        let create_images_time_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
+        let logical_size_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
        let imitate_logical_size_histo = StorageTimeMetrics::new(
            StorageTimeOperation::ImitateLogicalSize,
            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let load_layer_map_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::LoadLayerMap,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
-        let garbage_collect_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::Gc,
-            &tenant_id,
-            &shard_id,
            &timeline_id,
        );
+        let load_layer_map_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
+        let garbage_collect_histo =
+            StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
        let last_record_gauge = LAST_RECORD_LSN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
-        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let evictions = EVICTIONS
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
        let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
            .build(&tenant_id, &shard_id, &timeline_id);
@@ -1928,17 +1836,15 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ =
-                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ =
-            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1951,42 +1857,29 @@ impl Drop for TimelineMetrics {
        // outlive an individual smgr connection, but not the timeline.

        for op in StorageTimeOperation::VARIANTS {
-            let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
-                op,
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
-            let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
-                op,
-                tenant_id,
-                shard_id,
-                timeline_id,
-            ]);
+            let _ =
+                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
+            let _ =
+                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
        }

        for op in SmgrQueryType::iter() {
            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
                op.into(),
                tenant_id,
-                shard_id,
                timeline_id,
            ]);
        }
    }
 }

-pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
-    // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
-        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    }
-
+pub fn remove_tenant_metrics(tenant_id: &TenantId) {
+    let tid = tenant_id.to_string();
+    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    // we leave the BROKEN_TENANTS_SET entry if any
 }

@@ -2034,9 +1927,8 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
    }
 }

-pub(crate) struct RemoteTimelineClientMetrics {
+pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
-    shard_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -2048,7 +1940,6 @@ impl RemoteTimelineClientMetrics {
    pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_shard_id.tenant_id.to_string(),
-            shard_id: format!("{}", tenant_shard_id.shard_slug()),
            timeline_id: timeline_id.to_string(),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
@@ -2063,9 +1954,8 @@ impl RemoteTimelineClientMetrics {
            PerTimelineRemotePhysicalSizeGauge::new(
                REMOTE_PHYSICAL_SIZE
                    .get_metric_with_label_values(&[
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
+                        &self.tenant_id,
+                        &self.timeline_id,
                    ])
                    .unwrap(),
            )
@@ -2100,9 +1990,8 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
                .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2132,9 +2021,8 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
                .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2153,9 +2041,8 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
                .get_metric_with_label_values(&[
-                    &self.tenant_id,
-                    &self.shard_id,
-                    &self.timeline_id,
+                    &self.tenant_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2299,7 +2186,6 @@ impl Drop for RemoteTimelineClientMetrics {
    fn drop(&mut self) {
        let RemoteTimelineClientMetrics {
            tenant_id,
-            shard_id,
            timeline_id,
            remote_physical_size_gauge,
            calls_unfinished_gauge,
@@ -2309,7 +2195,6 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
-                shard_id,
                timeline_id,
                a,
                b,
@@ -2318,7 +2203,6 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
-                shard_id,
                timeline_id,
                a,
                b,
@@ -2327,7 +2211,6 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
                tenant_id,
-                shard_id,
                timeline_id,
                a,
                b,
@@ -2335,16 +2218,18 @@ impl Drop for RemoteTimelineClientMetrics {
        }
        {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
-            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        }
    }
 }

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
+pub trait MeasureRemoteOp: Sized {
    fn measure_remote_op(
        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2352,6 +2237,8 @@ pub(crate) trait MeasureRemoteOp: Sized {
        let start = Instant::now();
        MeasuredRemoteOp {
            inner: self,
+            tenant_id,
+            timeline_id,
            file_kind,
            op,
            start,
@@ -2363,10 +2250,12 @@ pub(crate) trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}

 pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
+    pub struct MeasuredRemoteOp<F>
    {
        #[pin]
        inner: F,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        start: Instant,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -550,6 +550,7 @@ impl PageCache {
    // not require changes.

    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
        match tokio::time::timeout(
            // Choose small timeout, neon_smgr does its own retries.
            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
@@ -562,6 +563,7 @@ impl PageCache {
                res.expect("this semaphore is never closed"),
            )),
            Err(_timeout) => {
+                timer.stop_and_discard();
                crate::metrics::page_cache_errors_inc(
                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
                );
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -13,10 +13,7 @@ use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
-use futures::stream::FuturesUnordered;
 use futures::Stream;
-use futures::StreamExt;
-use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -24,14 +21,10 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use pageserver_api::shard::ShardIndex;
-use pageserver_api::shard::{ShardCount, ShardNumber};
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
-use std::borrow::Cow;
-use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -46,7 +39,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
-use utils::sync::gate::GateGuard;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
@@ -61,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::Version;
+use crate::pgdatadir_mapping::{rel_block_to_key, Version};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -69,13 +61,9 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
-use crate::tenant::timeline::WaitLsnError;
-use crate::tenant::GetTimelineError;
-use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

-use pageserver_api::key::rel_block_to_key;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -282,13 +270,6 @@ async fn page_service_conn_main(
    }
 }

-/// While a handler holds a reference to a Timeline, it also holds a the
-/// timeline's Gate open.
-struct HandlerTimeline {
-    timeline: Arc<Timeline>,
-    _guard: GateGuard,
-}
-
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -300,72 +281,6 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
-
-    /// See [`Self::cache_timeline`] for usage.
-    ///
-    /// Note on size: the typical size of this map is 1.  The largest size we expect
-    /// to see is the number of shards divided by the number of pageservers (typically < 2),
-    /// or the ratio used when splitting shards (i.e. how many children created from one)
-    /// parent shard, where a "large" number might be ~8.
-    shard_timelines: HashMap<ShardIndex, HandlerTimeline>,
-}
-
-#[derive(thiserror::Error, Debug)]
-enum PageStreamError {
-    /// We encountered an error that should prompt the client to reconnect:
-    /// in practice this means we drop the connection without sending a response.
-    #[error("Reconnect required: {0}")]
-    Reconnect(Cow<'static, str>),
-
-    /// We were instructed to shutdown while processing the query
-    #[error("Shutting down")]
-    Shutdown,
-
-    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error")]
-    Read(#[source] PageReconstructError),
-
-    /// Ran out of time waiting for an LSN
-    #[error("LSN timeout: {0}")]
-    LsnTimeout(WaitLsnError),
-
-    /// The entity required to serve the request (tenant or timeline) is not found,
-    /// or is not found in a suitable state to serve a request.
-    #[error("Not found: {0}")]
-    NotFound(Cow<'static, str>),
-
-    /// Request asked for something that doesn't make sense, like an invalid LSN
-    #[error("Bad request: {0}")]
-    BadRequest(Cow<'static, str>),
-}
-
-impl From<PageReconstructError> for PageStreamError {
-    fn from(value: PageReconstructError) -> Self {
-        match value {
-            PageReconstructError::Cancelled => Self::Shutdown,
-            e => Self::Read(e),
-        }
-    }
-}
-
-impl From<GetActiveTimelineError> for PageStreamError {
-    fn from(value: GetActiveTimelineError) -> Self {
-        match value {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
-            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
-            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
-        }
-    }
-}
-
-impl From<WaitLsnError> for PageStreamError {
-    fn from(value: WaitLsnError) -> Self {
-        match value {
-            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
-            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
-        }
-    }
 }

 impl PageServerHandler {
@@ -381,64 +296,13 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
-            shard_timelines: HashMap::new(),
        }
    }

-    /// Future that completes when we need to shut down the connection.
-    ///
-    /// We currently need to shut down when any of the following happens:
-    /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
-    /// 2. task_mgr requests shutdown of the connection
-    ///
-    /// NB on (1): the connection's lifecycle is not actually tied to any of the
-    /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
-    /// implementation to be responsive to timeline cancellation because
-    /// the connection holds their `GateGuards` open (sored in `shard_timelines`).
-    /// We currently do the easy thing and terminate the connection if any of the
-    /// shard_timelines gets cancelled. But really, we cuold spend more effort
-    /// and simply remove the cancelled timeline from the `shard_timelines`, thereby
-    /// dropping the guard.
-    ///
-    /// NB: keep in sync with [`Self::is_connection_cancelled`]
-    async fn await_connection_cancelled(&self) {
-        // A short wait before we expend the cycles to walk our timeline map.  This avoids incurring
-        // that cost every time we check for cancellation.
-        tokio::time::sleep(Duration::from_millis(10)).await;
-
-        // This function is never called concurrently with code that adds timelines to shard_timelines,
-        // which is enforced by the borrow checker (the future returned by this function carries the
-        // immutable &self).  So it's fine to evaluate shard_timelines after the sleep, we don't risk
-        // missing any inserts to the map.
-
-        let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
-        use futures::future::Either;
-        cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
-        cancellation_sources.extend(
-            self.shard_timelines
-                .values()
-                .map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
-        );
-        FuturesUnordered::from_iter(cancellation_sources)
-            .next()
-            .await;
-    }
-
-    /// Checking variant of [`Self::await_connection_cancelled`].
-    fn is_connection_cancelled(&self) -> bool {
-        task_mgr::is_shutdown_requested()
-            || self
-                .shard_timelines
-                .values()
-                .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
-    }
-
-    /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`.  Pass in
-    /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect
-    /// cancellation if there aren't any timelines in the cache.
-    ///
-    /// If calling from a function that doesn't use the `[Self::shard_timelines]` cache, then pass in the
-    /// timeline cancellation token.
+    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
+    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
+    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
+    /// in the flush.
    async fn flush_cancellable<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -451,9 +315,6 @@ impl PageServerHandler {
            flush_r = pgb.flush() => {
                Ok(flush_r?)
            },
-            _ = self.await_connection_cancelled() => {
-                Err(QueryError::Shutdown)
-            }
            _ = cancel.cancelled() => {
                Err(QueryError::Shutdown)
            }
@@ -529,7 +390,7 @@ impl PageServerHandler {

    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
-        &mut self,
+        &self,
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -540,6 +401,10 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        // Note that since one connection may contain getpage requests that target different
+        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
+        // that we look up here may not be the one that serves all the actual requests: we will double
+        // check the mapping of key->shard later before calling into Timeline for getpage requests.
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
            ShardSelector::First,
@@ -560,15 +425,27 @@ impl PageServerHandler {
            None
        };

+        // Check that the timeline exists
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        // Avoid starting new requests if the timeline has already started shutting down,
+        // and block timeline shutdown until this request is complete, or drops out due
+        // to cancellation.
+        let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb, &tenant.cancel).await?;
+        self.flush_cancellable(pgb, &timeline.cancel).await?;
+
+        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

        loop {
            let msg = tokio::select! {
                biased;

-                _ = self.await_connection_cancelled() => {
+                _ = timeline.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    return Err(QueryError::Shutdown)
@@ -602,36 +479,40 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
-                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
+                        self.handle_get_rel_exists_request(&timeline, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
-                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
+                        self.handle_get_nblocks_request(&timeline, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
-                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
+                        self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
                    (
-                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
+                        self.handle_db_size_request(&timeline, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
@@ -639,47 +520,32 @@ impl PageServerHandler {
                }
            };

-            match response {
-                Err(PageStreamError::Shutdown) => {
+            if let Err(e) = &response {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_cancelled(): [`Timeline::shutdown`] has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
-                    span.in_scope(|| info!("dropping connection due to shutdown"));
+                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
                    return Err(QueryError::Shutdown);
                }
-                Err(PageStreamError::Reconnect(reason)) => {
-                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                    return Err(QueryError::Reconnect);
-                }
-                Err(e) if self.is_connection_cancelled() => {
-                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
-                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
-                    //
-                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                    // because wait_lsn etc will drop out
-                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                    // is_canceled(): [`Timeline::shutdown`]` has entered
-                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-                r => {
-                    let response_msg = r.unwrap_or_else(|e| {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        let full = utils::error::report_compact_sources(&e);
-                        span.in_scope(|| {
-                            error!("error reading relation or page version: {full:#}")
-                        });
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
-                    });
-
-                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-                    self.flush_cancellable(pgb, &tenant.cancel).await?;
-                }
            }
+
+            let response = response.unwrap_or_else(|e| {
+                // print all the details to the log with {:#}, but for the client the
+                // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                // here includes cancellation which is not an error.
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: e.to_string(),
+                })
+            });
+
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
+            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -826,7 +692,7 @@ impl PageServerHandler {
        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
-    ) -> Result<Lsn, PageStreamError> {
+    ) -> anyhow::Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -857,34 +723,24 @@ impl PageServerHandler {
            }
        } else {
            if lsn == Lsn(0) {
-                return Err(PageStreamError::BadRequest(
-                    "invalid LSN(0) in request".into(),
-                ));
+                anyhow::bail!("invalid LSN(0) in request");
            }
            timeline.wait_lsn(lsn, ctx).await?;
        }
-
-        if lsn < **latest_gc_cutoff_lsn {
-            return Err(PageStreamError::BadRequest(format!(
-                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                lsn, **latest_gc_cutoff_lsn
-            ).into()));
-        }
+        anyhow::ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
        Ok(lsn)
    }

    async fn handle_get_rel_exists_request(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        &self,
+        timeline: &Timeline,
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelExists);
-
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -900,18 +756,11 @@ impl PageServerHandler {
    }

    async fn handle_get_nblocks_request(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        &self,
+        timeline: &Timeline,
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
-
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelSize);
-
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -927,18 +776,11 @@ impl PageServerHandler {
    }

    async fn handle_db_size_request(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        &self,
+        timeline: &Timeline,
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
-
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetDbSize);
-
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -960,164 +802,16 @@ impl PageServerHandler {
        }))
    }

-    /// For most getpage requests, we will already have a Timeline to serve the request: this function
-    /// looks up such a Timeline synchronously and without touching any global state.
-    fn get_cached_timeline_for_page(
-        &mut self,
-        req: &PagestreamGetPageRequest,
-    ) -> Result<&Arc<Timeline>, Key> {
-        let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
-            // Fastest path: single sharded case
-            if first_idx.shard_count < ShardCount(2) {
-                return Ok(&first_timeline.timeline);
-            }
-
-            let key = rel_block_to_key(req.rel, req.blkno);
-            let shard_num = first_timeline
-                .timeline
-                .get_shard_identity()
-                .get_shard_number(&key);
-
-            // Fast path: matched the first timeline in our local handler map.  This case is common if
-            // only one shard per tenant is attached to this pageserver.
-            if first_timeline.timeline.get_shard_identity().number == shard_num {
-                return Ok(&first_timeline.timeline);
-            }
-
-            let shard_index = ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_timeline.timeline.get_shard_identity().count,
-            };
-
-            // Fast-ish path: timeline is in the connection handler's local cache
-            if let Some(found) = self.shard_timelines.get(&shard_index) {
-                return Ok(&found.timeline);
-            }
-
-            key
-        } else {
-            rel_block_to_key(req.rel, req.blkno)
-        };
-
-        Err(key)
-    }
-
-    /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable
-    /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`]
-    /// again.
-    ///
-    /// Note that all the Timelines in this cache are for the same timeline_id: they're differ
-    /// in which shard they belong to.  When we serve a getpage@lsn request, we choose a shard
-    /// based on key.
-    ///
-    /// The typical size of this cache is 1, as we generally create shards to distribute work
-    /// across pageservers, so don't tend to have multiple shards for the same tenant on the
-    /// same pageserver.
-    fn cache_timeline(
-        &mut self,
-        timeline: Arc<Timeline>,
-    ) -> Result<&Arc<Timeline>, GetActiveTimelineError> {
-        let gate_guard = timeline
-            .gate
-            .enter()
-            .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?;
-
-        let shard_index = timeline.tenant_shard_id.to_index();
-        let entry = self
-            .shard_timelines
-            .entry(shard_index)
-            .or_insert(HandlerTimeline {
-                timeline,
-                _guard: gate_guard,
-            });
-
-        Ok(&entry.timeline)
-    }
-
-    /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with
-    /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver.  If no such
-    /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node).
-    async fn load_timeline_for_page(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        key: Key,
-    ) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
-        // Slow path: we must call out to the TenantManager to find the timeline for this Key
-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key))
-            .await?;
-
-        self.cache_timeline(timeline)
-    }
-
-    async fn get_timeline_shard_zero(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
-        // This is a borrow-checker workaround: we can't return from inside of the  `if let Some` because
-        // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable
-        // ref to salf.  So instead, we first build a bool, and then return while not borrowing self.
-        let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() {
-            idx.shard_number == ShardNumber(0)
-        } else {
-            false
-        };
-
-        if have_cached {
-            let entry = self.shard_timelines.iter().next().unwrap();
-            Ok(&entry.1.timeline)
-        } else {
-            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
-                .await?;
-            Ok(self.cache_timeline(timeline)?)
-        }
-    }
-
-    async fn handle_get_page_at_lsn_request(
-        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+    async fn do_handle_get_page_at_lsn_request(
+        &self,
+        timeline: &Timeline,
        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = match self.get_cached_timeline_for_page(req) {
-            Ok(tl) => tl,
-            Err(key) => {
-                match self
-                    .load_timeline_for_page(tenant_id, timeline_id, key)
-                    .await
-                {
-                    Ok(t) => t,
-                    Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                        // We already know this tenant exists in general, because we resolved it at
-                        // start of connection.  Getting a NotFound here indicates that the shard containing
-                        // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                        // mapping is out of date.
-                        //
-                        // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                        // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                        // and talk to a different pageserver.
-                        return Err(PageStreamError::Reconnect(
-                            "getpage@lsn request routed to wrong shard".into(),
-                        ));
-                    }
-                    Err(e) => return Err(e.into()),
-                }
-            }
-        };
-
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
-
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;
-
        let page = timeline
            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
            .await?;
@@ -1127,6 +821,58 @@ impl PageServerHandler {
        }))
    }

+    async fn handle_get_page_at_lsn_request(
+        &self,
+        timeline: &Timeline,
+        req: &PagestreamGetPageRequest,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<PagestreamBeMessage> {
+        let key = rel_block_to_key(req.rel, req.blkno);
+        if timeline.get_shard_identity().is_key_local(&key) {
+            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
+                .await
+        } else {
+            // The Tenant shard we looked up at connection start does not hold this particular
+            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
+            // has multiple shards for the same tenant.
+            //
+            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
+            let timeline = match self
+                .get_active_tenant_timeline(
+                    timeline.tenant_shard_id.tenant_id,
+                    timeline.timeline_id,
+                    ShardSelector::Page(key),
+                )
+                .await
+            {
+                Ok(t) => t,
+                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                    // We already know this tenant exists in general, because we resolved it at
+                    // start of connection.  Getting a NotFound here indicates that the shard containing
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
+                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                }
+                Err(e) => return Err(e.into()),
+            };
+
+            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
+            // the GateGuard was already held over the whole connection.
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+
+            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
+                .await
+        }
+    }
+
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
@@ -1265,7 +1011,9 @@ impl PageServerHandler {
        )
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
        Ok(timeline)
    }
 }
@@ -1674,8 +1422,7 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::Cancelled
-            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                QueryError::Shutdown
            }
            e => QueryError::Other(anyhow::anyhow!(e)),
@@ -1688,15 +1435,14 @@ enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
-    Timeline(#[from] GetTimelineError),
+    Timeline(anyhow::Error),
 }

 impl From<GetActiveTimelineError> for QueryError {
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
        }
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,13 +13,8 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::{
-    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
-    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
-    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
-};
-use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::key::is_rel_block_key;
+use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -27,12 +22,14 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
-use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

+/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
+pub type BlockNumber = u32;
+
 #[derive(Debug)]
 pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
@@ -163,7 +160,7 @@ impl Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    pub(crate) async fn get_rel_page_at_lsn(
+    pub async fn get_rel_page_at_lsn(
        &self,
        tag: RelTag,
        blknum: BlockNumber,
@@ -194,7 +191,7 @@ impl Timeline {
    }

    // Get size of a database in blocks
-    pub(crate) async fn get_db_size(
+    pub async fn get_db_size(
        &self,
        spcnode: Oid,
        dbnode: Oid,
@@ -214,7 +211,7 @@ impl Timeline {
    }

    /// Get size of a relation file
-    pub(crate) async fn get_rel_size(
+    pub async fn get_rel_size(
        &self,
        tag: RelTag,
        version: Version<'_>,
@@ -259,7 +256,7 @@ impl Timeline {
    }

    /// Does relation exist?
-    pub(crate) async fn get_rel_exists(
+    pub async fn get_rel_exists(
        &self,
        tag: RelTag,
        version: Version<'_>,
@@ -294,7 +291,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn list_rels(
+    pub async fn list_rels(
        &self,
        spcnode: Oid,
        dbnode: Oid,
@@ -322,7 +319,7 @@ impl Timeline {
    }

    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
+    pub async fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -335,7 +332,7 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_size(
+    pub async fn get_slru_segment_size(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -348,7 +345,7 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_exists(
+    pub async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -375,7 +372,7 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub(crate) async fn find_lsn_for_timestamp(
+    pub async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
        cancel: &CancellationToken,
@@ -455,7 +452,7 @@ impl Timeline {
    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    pub(crate) async fn is_latest_commit_timestamp_ge_than(
+    pub async fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -478,7 +475,7 @@ impl Timeline {
    /// Obtain the possible timestamp range for the given lsn.
    ///
    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub(crate) async fn get_timestamp_for_lsn(
+    pub async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -534,35 +531,8 @@ impl Timeline {
        Ok(Default::default())
    }

-    pub(crate) async fn get_slru_keyspace(
-        &self,
-        version: Version<'_>,
-        ctx: &RequestContext,
-    ) -> Result<KeySpace, PageReconstructError> {
-        let mut accum = KeySpaceAccum::new();
-
-        for kind in SlruKind::iter() {
-            let mut segments: Vec<u32> = self
-                .list_slru_segments(kind, version, ctx)
-                .await?
-                .into_iter()
-                .collect();
-            segments.sort_unstable();
-
-            for seg in segments {
-                let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
-
-                accum.add_range(
-                    slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
-                );
-            }
-        }
-
-        Ok(accum.to_keyspace())
-    }
-
    /// Get a list of SLRU segments
-    pub(crate) async fn list_slru_segments(
+    pub async fn list_slru_segments(
        &self,
        kind: SlruKind,
        version: Version<'_>,
@@ -578,7 +548,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_relmap_file(
+    pub async fn get_relmap_file(
        &self,
        spcnode: Oid,
        dbnode: Oid,
@@ -591,7 +561,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub(crate) async fn list_dbdirs(
+    pub async fn list_dbdirs(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -605,7 +575,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_twophase_file(
+    pub async fn get_twophase_file(
        &self,
        xid: TransactionId,
        lsn: Lsn,
@@ -616,7 +586,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub(crate) async fn list_twophase_files(
+    pub async fn list_twophase_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -630,7 +600,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_control_file(
+    pub async fn get_control_file(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -638,7 +608,7 @@ impl Timeline {
        self.get(CONTROLFILE_KEY, lsn, ctx).await
    }

-    pub(crate) async fn get_checkpoint(
+    pub async fn get_checkpoint(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -646,7 +616,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub(crate) async fn list_aux_files(
+    pub async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -1568,6 +1538,381 @@ struct SlruSegmentDirectory {

 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

+// Layout of the Key address space
+//
+// The Key struct, used to address the underlying key-value store, consists of
+// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
+// all the data and metadata keys into those 18 bytes.
+//
+// Principles for the mapping:
+//
+// - Things that are often accessed or modified together, should be close to
+//   each other in the key space. For example, if a relation is extended by one
+//   block, we create a new key-value pair for the block data, and update the
+//   relation size entry. Because of that, the RelSize key comes after all the
+//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
+//   to each other.
+//
+// The key space is divided into four major sections, identified by the first
+// byte, and they form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir    () -> (dbnode, spcnode)
+//   Filenodemap
+//   RelDir   -> relnode forknum
+//       RelBlocks
+//       RelSize
+//
+// 01 SLRUs
+//
+//   SlruDir  kind
+//   SlruSegBlocks segno
+//   SlruSegSize
+//
+// 02 pg_twophase
+//
+// 03 misc
+//    Controlfile
+//    checkpoint
+//    pg_version
+//
+// 04 aux files (NOTE: AUX_FILES_KEY is currently allocated under prefix 03, see the list below)
+//
+// Below is a full list of the keyspace allocation:
+//
+// DbDir:
+// 00 00000000 00000000 00000000 00   00000000
+//
+// Filenodemap:
+// 00 SPCNODE  DBNODE   00000000 00   00000000
+//
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
+//
+// RelBlock:
+// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
+//
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+//
+// SlruDir:
+// 01 kind     00000000 00000000 00   00000000
+//
+// SlruSegBlock:
+// 01 kind     00000001 SEGNO    00   BLKNUM
+//
+// SlruSegSize:
+// 01 kind     00000001 SEGNO    00   FFFFFFFF
+//
+// TwoPhaseDir:
+// 02 00000000 00000000 00000000 00   00000000
+//
+// TwoPhaseFile:
+// 02 00000000 00000000 00000000 00   XID
+//
+// ControlFile:
+// 03 00000000 00000000 00000000 00   00000000
+//
+// Checkpoint:
+// 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
+//-- Section 01: relation data and metadata
+
+const DBDIR_KEY: Key = Key {
+    field1: 0x00,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0xffffffff,
+        field5: 0xff,
+        field6: 0xffffffff,
+    }
+}
+
+fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 1,
+    }
+}
+
+pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+fn rel_size_to_key(rel: RelTag) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0xffffffff,
+    }
+}
+
+fn rel_key_range(rel: RelTag) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum + 1,
+        field6: 0,
+    }
+}
+
+//-- Section 02: SLRUs
+
+fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0xffffffff,
+    }
+}
+
+fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+    let field2 = match kind {
+        SlruKind::Clog => 0x00,
+        SlruKind::MultiXactMembers => 0x01,
+        SlruKind::MultiXactOffsets => 0x02,
+    };
+
+    Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 1,
+        field6: 0,
+    }
+}
+
+//-- Section 03: pg_twophase
+
+const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+fn twophase_file_key(xid: TransactionId) -> Key {
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }
+}
+
+fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+    let (next_xid, overflowed) = xid.overflowing_add(1);
+
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }..Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: u8::from(overflowed),
+        field6: next_xid,
+    }
+}
+
+//-- Section 04 (key prefix 0x03): control file, checkpoint and aux files
+const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1,
+};
+
+const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
+    Ok(match key.field1 {
+        0x00 => (
+            RelTag {
+                spcnode: key.field2,
+                dbnode: key.field3,
+                relnode: key.field4,
+                forknum: key.field5,
+            },
+            key.field6,
+        ),
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}
+
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00
+        && key.field4 != 0
+        && key.field5 == VISIBILITYMAP_FORKNUM
+        && key.field6 != 0xffffffff
+}
+
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => {
+            let kind = match key.field2 {
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6;
+
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
+    })
+}
+
+fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU-related
+        && key.field3 == 0x00000001   // but not SlruDir
+        && key.field6 != 0xffffffff // and not SlruSegSize
+}
+
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -23,7 +23,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
+    #[allow(clippy::useless_conversion)]
    pub fn blocks(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
@@ -32,7 +32,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
+    #[allow(clippy::useless_conversion)]
    pub fn blocks_available(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -258,9 +258,6 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

-    /// See [`crate::tenant::secondary`].
-    SecondaryDownloads,
-
    /// See [`crate::tenant::secondary`].
    SecondaryUploads,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,13 +12,12 @@
 //!

 use anyhow::{bail, Context};
-use camino::Utf8Path;
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
-use pageserver_api::models;
+use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TimelineState;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
@@ -52,13 +51,13 @@ use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
+use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
-use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -74,7 +73,6 @@ use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
-use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
@@ -91,6 +89,7 @@ use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
+use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -111,7 +110,7 @@ use toml_edit;
 use utils::{
    crashsafe,
    generation::Generation,
-    id::TimelineId,
+    id::{TenantId, TimelineId},
    lsn::{Lsn, RecordLsn},
 };

@@ -132,13 +131,6 @@ macro_rules! pausable_failpoint {
            .expect("spawn_blocking");
        }
    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
-            }
-        }
-    };
 }

 pub mod blob_io;
@@ -370,13 +362,13 @@ impl WalRedoManager {
 pub enum GetTimelineError {
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
-        tenant_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
        state: TimelineState,
    },
    #[error("Timeline {tenant_id}/{timeline_id} was not found")]
    NotFound {
-        tenant_id: TenantShardId,
+        tenant_id: TenantId,
        timeline_id: TimelineId,
    },
 }
@@ -627,15 +619,9 @@ impl Tenant {
            deletion_queue_client,
        ));

-        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
-        // we shut down while attaching.
-        let Ok(attach_gate_guard) = tenant.gate.enter() else {
-            // We just created the Tenant: nothing else can have shut it down yet
-            unreachable!();
-        };
-
        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
+
        let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
@@ -645,8 +631,6 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
-                let _gate_guard = attach_gate_guard;
-
                // Is this tenant being spawned as part of process startup?
                let starting_up = init_order.is_some();
                scopeguard::defer! {
@@ -723,10 +707,6 @@ impl Tenant {
                            // stayed in Activating for such a long time that shutdown found it in
                            // that state.
                            tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
-                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
-                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
-                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                            return Ok(());
                        },
                    )
@@ -821,7 +801,7 @@ impl Tenant {
                    SpawnMode::Create => None,
                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
                };
-                match tenant_clone.attach(preload, mode, &ctx).await {
+                match tenant_clone.attach(preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        if let Some(t)=  attach_timer {t.observe_duration();}
@@ -908,20 +888,15 @@ impl Tenant {
    async fn attach(
        self: &Arc<Tenant>,
        preload: Option<TenantPreload>,
-        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        failpoint_support::sleep_millis_async!("before-attaching-tenant");

-        let preload = match (preload, mode) {
-            (Some(p), _) => p,
-            (None, SpawnMode::Create) => TenantPreload {
-                deleting: false,
-                timelines: HashMap::new(),
-            },
-            (None, SpawnMode::Normal) => {
+        let preload = match preload {
+            Some(p) => p,
+            None => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
                return self.load_local(ctx).await;
@@ -1029,10 +1004,7 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        fail::fail_point!("attach-before-activate", |_| {
-            anyhow::bail!("attach-before-activate");
-        });
-        failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
+        failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

@@ -1536,6 +1508,10 @@ impl Tenant {
            .map_err(LoadLocalTimelineError::Load)
    }

+    /// Returns the `tenant_id` field of this tenant's `TenantShardId`.
+    pub(crate) fn tenant_id(&self) -> TenantId {
+        self.tenant_shard_id.tenant_id
+    }
+
    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
        self.tenant_shard_id
    }
@@ -1551,13 +1527,14 @@ impl Tenant {
        let timeline = timelines_accessor
            .get(&timeline_id)
            .ok_or(GetTimelineError::NotFound {
-                tenant_id: self.tenant_shard_id,
+                tenant_id: self.tenant_shard_id.tenant_id,
                timeline_id,
            })?;

        if active_only && !timeline.is_active() {
+            tracing::warn!("Timeline {} is not active", timeline.timeline_id);
            Err(GetTimelineError::NotActive {
-                tenant_id: self.tenant_shard_id,
+                tenant_id: self.tenant_shard_id.tenant_id,
                timeline_id,
                state: timeline.current_state(),
            })
@@ -1696,13 +1673,9 @@ impl Tenant {
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        if !self.is_active() {
-            if matches!(self.current_state(), TenantState::Stopping { .. }) {
-                return Err(CreateTimelineError::ShuttingDown);
-            } else {
-                return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                    "Cannot create timelines on inactive tenant"
-                )));
-            }
+            return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                "Cannot create timelines on inactive tenant"
+            )));
        }

        let _gate = self
@@ -1788,15 +1761,7 @@ impl Tenant {
                    // decoding the new WAL might need to look up previous pages, relation
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
-                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
-                        .await
-                        .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
-                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
-                            }
-                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
-                        })?;
+                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                }

                self.branch_timeline(
@@ -1952,10 +1917,6 @@ impl Tenant {
        self.current_state() == TenantState::Active
    }

-    pub fn generation(&self) -> Generation {
-        self.generation
-    }
-
    /// Changes tenant status to active, unless shutdown was already requested.
    ///
    /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
@@ -2069,13 +2030,6 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        // If we're still attaching, fire the cancellation token early to drop out: this
-        // will prevent us flushing, but ensures timely shutdown if some I/O during attach
-        // is very slow.
-        if matches!(self.current_state(), TenantState::Attaching) {
-            self.cancel.cancel();
-        }
-
        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
@@ -2345,32 +2299,6 @@ impl Tenant {
            .clone()
    }

-    /// For API access: generate a LocationConfig equivalent to the one that would be used to
-    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
-    /// rare external API calls, like a reconciliation at startup.
-    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
-
-        let location_config_mode = match conf.location.attach_mode {
-            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
-            AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
-            AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
-        };
-
-        // We have a pageserver TenantConf, we need the API-facing TenantConfig.
-        let tenant_config: models::TenantConfig = conf.tenant_conf.into();
-
-        models::LocationConfig {
-            mode: location_config_mode,
-            generation: self.generation.into(),
-            secondary_conf: None,
-            shard_number: self.shard_identity.number.0,
-            shard_count: self.shard_identity.count.0,
-            shard_stripe_size: self.shard_identity.stripe_size.0,
-            tenant_conf: tenant_config,
-        }
-    }
-
    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
        &self.tenant_shard_id
    }
@@ -2378,6 +2306,66 @@ impl Tenant {
    pub(crate) fn get_generation(&self) -> Generation {
        self.generation
    }
+
+    /// Prepares this (parent) shard for a split by writing an index for every timeline
+    /// under each of the `child_shards`.
+    ///
+    /// For each timeline currently in `self.timelines` this:
+    /// 1. schedules an index upload from the parent and waits for it to complete,
+    /// 2. shuts down the timeline's remote client,
+    /// 3. downloads the timeline's index again, and
+    /// 4. uploads a copy of that index for every child shard, using the parent's generation.
+    ///
+    /// `child_shards` is accepted as a slice; callers holding a `Vec` can keep passing `&vec`.
+    ///
+    /// # Errors
+    /// Fails if a timeline has no remote client, if the tenant has no remote storage, if the
+    /// downloaded index reports the timeline as deleted, or if any of the upload, shutdown or
+    /// download steps returns an error.
+    pub(crate) async fn split_prepare(
+        &self,
+        child_shards: &[TenantShardId],
+    ) -> anyhow::Result<()> {
+        // Snapshot the timeline map so the mutex guard is not held across the awaits below.
+        let timelines = self.timelines.lock().unwrap().clone();
+        for timeline in timelines.values() {
+            let Some(tl_client) = &timeline.remote_client else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            let Some(remote_storage) = &self.remote_storage else {
+                anyhow::bail!("Remote storage is mandatory");
+            };
+
+            // TODO: some higher level should enforce that timeline creation/deletion does not
+            // happen concurrently with splits.  This is impossible to safely coordinate locally
+            // within one single pageserver's view of the world.
+
+            // Upload an index from the parent: this is partly to provide freshness for the
+            // child tenants that will copy it, and partly for general ease-of-debugging: there will
+            // always be a parent shard index in the same generation as we wrote the child shard index.
+            tl_client.schedule_index_upload_for_file_changes()?;
+            tl_client.wait_completion().await?;
+
+            // Shut down the timeline's remote client: this means that the indices we write
+            // for child shards will not be invalidated by the parent shard deleting layers.
+            tl_client.shutdown().await?;
+
+            // Download methods can still be used after shutdown, as they don't flow through the remote client's
+            // queue.
+            // TODO: create a way for remote timeline client to give us a copy of the last IndexPart it uploaded
+            //       without having to download it again.
+            // Use the tenant's cancellation token (the same one the uploads below use) so that
+            // the download is interrupted when the tenant is cancelled, rather than a fresh
+            // token that can never fire.
+            let result = tl_client
+                .download_index_file(self.cancel.clone())
+                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
+                .await?;
+            let index_part = match result {
+                MaybeDeletedIndexPart::Deleted(_) => {
+                    anyhow::bail!("Timeline deletion happened concurrently with split")
+                }
+                MaybeDeletedIndexPart::IndexPart(p) => p,
+            };
+
+            // Write the parent's index under every child shard, in the parent's generation.
+            for child_shard in child_shards {
+                upload_index_part(
+                    remote_storage,
+                    child_shard,
+                    &timeline.timeline_id,
+                    self.generation,
+                    &index_part,
+                    &self.cancel,
+                )
+                .await?;
+            }
+        }
+
+        Ok(())
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2616,9 +2604,7 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            // Strings for metric labels
            let tid = tenant_shard_id.to_string();
-            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());

            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                ([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2631,15 +2617,13 @@ impl Tenant {
                // the tenant might be ignored and reloaded, so first remove any previous set
                // element. it most likely has already been scraped, as these are manual operations
                // right now. most likely we will add it back very soon.
-                drop(
-                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
-                );
+                drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
                false
            } else {
                // add the id to the set right away, there should not be any updates on the channel
                // after
                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid, &shard_id_str])
+                    .with_label_values(&[&tid])
                    .set(1);
                true
            };
@@ -2665,7 +2649,7 @@ impl Tenant {
                    counted_broken = true;
                    // insert the tenant_id (back) into the set
                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid, &shard_id_str])
+                        .with_label_values(&[&tid])
                        .inc();
                }
            }
@@ -2729,7 +2713,7 @@ impl Tenant {
            Ok(LocationConf::attached_single(
                tenant_conf,
                Generation::none(),
-                &models::ShardParameters::default(),
+                &ShardParameters::default(),
            ))
        } else {
            // FIXME If the config file is not found, assume that we're attaching
@@ -2805,10 +2789,6 @@ impl Tenant {
 "#
        .to_string();

-        fail::fail_point!("tenant-config-before-write", |_| {
-            anyhow::bail!("tenant-config-before-write");
-        });
-
        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;

@@ -3234,55 +3214,6 @@ impl Tenant {
        .await
    }

-    async fn upload_initdb(
-        &self,
-        timelines_path: &Utf8PathBuf,
-        pgdata_path: &Utf8PathBuf,
-        timeline_id: &TimelineId,
-    ) -> anyhow::Result<()> {
-        let Some(storage) = &self.remote_storage else {
-            // No remote storage?  No upload.
-            return Ok(());
-        };
-
-        let temp_path = timelines_path.join(format!(
-            "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
-        ));
-
-        scopeguard::defer! {
-            if let Err(e) = fs::remove_file(&temp_path) {
-                error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
-            }
-        }
-
-        let (pgdata_zstd, tar_zst_size) =
-            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
-
-        pausable_failpoint!("before-initdb-upload");
-
-        backoff::retry(
-            || async {
-                self::remote_timeline_client::upload_initdb_dir(
-                    storage,
-                    &self.tenant_shard_id.tenant_id,
-                    timeline_id,
-                    pgdata_zstd.try_clone().await?,
-                    tar_zst_size,
-                    &self.cancel,
-                )
-                .await
-            },
-            |_| false,
-            3,
-            u32::MAX,
-            "persist_initdb_tar_zst",
-            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
-        )
-        .await?;
-
-        Ok(())
-    }
-
    /// - run initdb to init temporary instance and get bootstrap data
    /// - after initialization completes, tar up the temp dir and upload it to S3.
    ///
@@ -3322,18 +3253,6 @@ impl Tenant {
            let Some(storage) = &self.remote_storage else {
                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
            };
-            if existing_initdb_timeline_id != timeline_id {
-                let source_path = &remote_initdb_archive_path(
-                    &self.tenant_shard_id.tenant_id,
-                    &existing_initdb_timeline_id,
-                );
-                let dest_path =
-                    &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
-                storage
-                    .copy_object(source_path, dest_path)
-                    .await
-                    .context("copy initdb tar")?;
-            }
            let (initdb_tar_zst_path, initdb_tar_zst) =
                self::remote_timeline_client::download_initdb_tar_zst(
                    self.conf,
@@ -3344,26 +3263,68 @@ impl Tenant {
                )
                .await
                .context("download initdb tar")?;
-
-            scopeguard::defer! {
-                if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
-                    error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
-                }
-            }
-
            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;
+
+            tokio::fs::remove_file(&initdb_tar_zst_path)
+                .await
+                .or_else(|e| {
+                    if e.kind() == std::io::ErrorKind::NotFound {
+                        // If something else already removed the file, ignore the error
+                        Ok(())
+                    } else {
+                        Err(e)
+                    }
+                })
+                .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
        } else {
-            // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path
+            // Init a temporary repo to get bootstrap data; this creates a directory at `pgdata_path`
            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

            // Upload the created data dir to S3
-            if self.tenant_shard_id().is_zero() {
-                self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
+            if let Some(storage) = &self.remote_storage {
+                if self.tenant_shard_id().is_zero() {
+                    let temp_path = timelines_path.join(format!(
+                        "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
+                    ));
+
+                    let (pgdata_zstd, tar_zst_size) =
+                        import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
+                    backoff::retry(
+                        || async {
+                            self::remote_timeline_client::upload_initdb_dir(
+                                storage,
+                                &self.tenant_shard_id.tenant_id,
+                                &timeline_id,
+                                pgdata_zstd.try_clone().await?,
+                                tar_zst_size,
+                                &self.cancel,
+                            )
+                            .await
+                        },
+                        |_| false,
+                        3,
+                        u32::MAX,
+                        "persist_initdb_tar_zst",
+                        backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
+                    )
                    .await?;
+
+                    tokio::fs::remove_file(&temp_path)
+                        .await
+                        .or_else(|e| {
+                            if e.kind() == std::io::ErrorKind::NotFound {
+                                // If something else already removed the file, ignore the error
+                                Ok(())
+                            } else {
+                                Err(e)
+                            }
+                        })
+                        .with_context(|| format!("tempfile removal {temp_path}"))?;
+                }
            }
        }
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
@@ -3652,9 +3613,6 @@ impl Tenant {
        self.cached_synthetic_tenant_size
            .store(size, Ordering::Relaxed);

-        // Only shard zero should be calculating synthetic sizes
-        debug_assert!(self.shard_identity.is_zero());
-
        TENANT_SYNTHETIC_SIZE_METRIC
            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
            .unwrap()
@@ -3724,6 +3682,10 @@ impl Tenant {

        Ok(())
    }
+
+    /// Returns the `tenant_conf` value read from `self.tenant_conf` under its read guard.
+    ///
+    /// Panics if acquiring the read guard fails (the result of `read()` is unwrapped).
+    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
+        self.tenant_conf.read().unwrap().tenant_conf
+    }
 }

 fn remove_timeline_and_uninit_mark(
@@ -3749,6 +3711,140 @@ fn remove_timeline_and_uninit_mark(
    Ok(())
 }

+/// Creates the on-disk directory structure for a new tenant shard and returns the path of
+/// the final tenant directory.
+///
+/// The structure is first built in a temporary sibling directory (the target path with
+/// `TEMP_FILE_SUFFIX` appended) and then moved into place by
+/// `try_create_target_tenant_dir`. If that step fails, the temporary directory is removed
+/// on a best-effort basis (cleanup failures are only logged) and the original error is
+/// returned.
+///
+/// # Errors
+/// Fails if the target tenant directory already exists, if its existence cannot be checked,
+/// if the temporary directory cannot be created, or if populating/renaming it fails.
+pub(crate) async fn create_tenant_files(
+    conf: &'static PageServerConf,
+    location_conf: &LocationConf,
+    tenant_shard_id: &TenantShardId,
+) -> anyhow::Result<Utf8PathBuf> {
+    let target_tenant_directory = conf.tenant_path(tenant_shard_id);
+    // Refuse to overwrite an existing tenant directory.
+    anyhow::ensure!(
+        !target_tenant_directory
+            .try_exists()
+            .context("check existence of tenant directory")?,
+        "tenant directory already exists",
+    );
+
+    let temporary_tenant_dir =
+        path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
+    debug!("Creating temporary directory structure in {temporary_tenant_dir}");
+
+    // top-level dir may exist if we are creating it through CLI
+    crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
+        format!("could not create temporary tenant directory {temporary_tenant_dir}")
+    })?;
+
+    let creation_result = try_create_target_tenant_dir(
+        conf,
+        location_conf,
+        tenant_shard_id,
+        &temporary_tenant_dir,
+        &target_tenant_directory,
+    )
+    .await;
+
+    // Best-effort cleanup: failures here are logged but do not replace the creation error.
+    if creation_result.is_err() {
+        error!(
+            "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data"
+        );
+        if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
+            error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
+        // NOTE(review): this fsyncs the path that was just removed; presumably the parent
+        // directory is what needs syncing to persist the removal -- confirm against
+        // `crashsafe::fsync`.
+        } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
+            error!(
+                "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
+            )
+        }
+    }
+
+    // Propagate the creation error (if any) only after cleanup has been attempted.
+    creation_result?;
+
+    Ok(target_tenant_directory)
+}
+
+/// Populates `temporary_tenant_dir` with the tenant's config files and timelines directory,
+/// then renames it to `target_tenant_directory`.
+///
+/// The paths that `conf` reports for the tenant (timelines dir, legacy config, location
+/// config) are rebased from the target directory onto the temporary one, so everything is
+/// written under the temporary directory before the rename.
+///
+/// # Errors
+/// Returns an error if any path cannot be rebased, if persisting the config fails, if a
+/// directory cannot be created, fsynced or renamed, or if the
+/// `tenant-creation-before-tmp-rename` failpoint is triggered.
+async fn try_create_target_tenant_dir(
+    conf: &'static PageServerConf,
+    location_conf: &LocationConf,
+    tenant_shard_id: &TenantShardId,
+    temporary_tenant_dir: &Utf8Path,
+    target_tenant_directory: &Utf8Path,
+) -> Result<(), anyhow::Error> {
+    let temporary_tenant_timelines_dir = rebase_directory(
+        &conf.timelines_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?;
+    // NOTE(review): the legacy and the location config paths below use the same error
+    // context text, so a failure in one cannot be told apart from the other by message.
+    let temporary_legacy_tenant_config_path = rebase_directory(
+        &conf.tenant_config_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
+    let temporary_tenant_config_path = rebase_directory(
+        &conf.tenant_location_config_path(tenant_shard_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
+
+    // Write the config files into the temporary directory.
+    Tenant::persist_tenant_config_at(
+        tenant_shard_id,
+        &temporary_tenant_config_path,
+        &temporary_legacy_tenant_config_path,
+        location_conf,
+    )
+    .await?;
+
+    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
+        format!(
+            "create tenant {} temporary timelines directory {}",
+            tenant_shard_id, temporary_tenant_timelines_dir,
+        )
+    })?;
+    // Test hook: lets tests inject a failure after the temporary directory is populated
+    // but before it is renamed into place.
+    fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
+        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
+    });
+
+    // Make sure the current tenant directory entries are durable before renaming.
+    // Without this, a crash may reorder any of the directory entry creations above.
+    crashsafe::fsync(temporary_tenant_dir)
+        .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
+
+    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
+        format!(
+            "move tenant {} temporary directory {} into the permanent one {}",
+            tenant_shard_id, temporary_tenant_dir, target_tenant_directory
+        )
+    })?;
+    // Fsync the parent directory after the rename.
+    let target_dir_parent = target_tenant_directory.parent().with_context(|| {
+        format!(
+            "get tenant {} dir parent for {}",
+            tenant_shard_id, target_tenant_directory,
+        )
+    })?;
+    crashsafe::fsync(target_dir_parent).with_context(|| {
+        format!(
+            "fsync renamed directory's parent {} for tenant {}",
+            target_dir_parent, tenant_shard_id,
+        )
+    })?;
+
+    Ok(())
+}
+
+/// Re-roots `original_path` from `base` onto `new_base`.
+///
+/// The part of `original_path` that follows `base` is appended to `new_base`.
+///
+/// # Errors
+/// Returns an error if `base` is not a prefix of `original_path`.
+fn rebase_directory(
+    original_path: &Utf8Path,
+    base: &Utf8Path,
+    new_base: &Utf8Path,
+) -> anyhow::Result<Utf8PathBuf> {
+    original_path
+        .strip_prefix(base)
+        .map(|suffix| new_base.join(suffix))
+        .with_context(|| {
+            format!(
+                "Failed to strip base prefix '{}' off path '{}'",
+                base, original_path
+            )
+        })
+}
+
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
 /// to get bootstrap data for timeline initialization.
 async fn run_initdb(
@@ -3778,25 +3874,27 @@ async fn run_initdb(
        .env_clear()
        .env("LD_LIBRARY_PATH", &initdb_lib_dir)
        .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        // If the `select!` below is cancelled before `wait_with_output` finishes, the child is
+        // killed on drop and tokio `wait()`s for it asynchronously.
+        // This means there is a slim chance we briefly exceed the INIT_DB_SEMAPHORE limit.
+        // TODO: fix for this is non-trivial, see
+        // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
+        //
+        .kill_on_drop(true)
        .spawn()?;

-    // Ideally we'd select here with the cancellation token, but the problem is that
-    // we can't safely terminate initdb: it launches processes of its own, and killing
-    // initdb doesn't kill them. After we return from this function, we want the target
-    // directory to be able to be cleaned up.
-    // See https://github.com/neondatabase/neon/issues/6385
-    let initdb_output = initdb_command.wait_with_output().await?;
-    if !initdb_output.status.success() {
-        return Err(InitdbError::Failed(
-            initdb_output.status,
-            initdb_output.stderr,
-        ));
-    }
-
-    // This isn't true cancellation support, see above. Still return an error to
-    // excercise the cancellation code path.
-    if cancel.is_cancelled() {
-        return Err(InitdbError::Cancelled);
+    tokio::select! {
+        initdb_output = initdb_command.wait_with_output() => {
+            let initdb_output = initdb_output?;
+            if !initdb_output.status.success() {
+                return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
+            }
+        }
+        _ = cancel.cancelled() => {
+            return Err(InitdbError::Cancelled);
+        }
    }

    Ok(())
@@ -3804,7 +3902,7 @@ async fn run_initdb(

 impl Drop for Tenant {
    fn drop(&mut self) {
-        remove_tenant_metrics(&self.tenant_shard_id);
+        remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
    }
 }
 /// Dump contents of a layer file to stdout.
@@ -3841,9 +3939,7 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
    use bytes::{Bytes, BytesMut};
-    use camino::Utf8PathBuf;
    use once_cell::sync::OnceCell;
-    use pageserver_api::models::ShardParameters;
    use pageserver_api::shard::ShardIndex;
    use std::fs;
    use std::sync::Arc;
@@ -3910,6 +4006,8 @@ pub(crate) mod harness {
    pub struct TenantHarness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
+        // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id
+        pub(crate) tenant_id: TenantId,
        pub tenant_shard_id: TenantShardId,
        pub generation: Generation,
        pub shard: ShardIndex,
@@ -3971,6 +4069,7 @@ pub(crate) mod harness {
            Ok(Self {
                conf,
                tenant_conf,
+                tenant_id,
                tenant_shard_id,
                generation: Generation::new(0xdeadbeef),
                shard: ShardIndex::unsharded(),
@@ -4052,7 +4151,7 @@ pub(crate) mod harness {
                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                    tenant
-                        .attach(Some(preload), SpawnMode::Normal, ctx)
+                        .attach(Some(preload), ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
@@ -5232,7 +5331,7 @@ mod tests {
                assert_eq!(
                    e,
                    GetTimelineError::NotFound {
-                        tenant_id: tenant.tenant_shard_id,
+                        tenant_id: tenant.tenant_shard_id.tenant_id,
                        timeline_id: TIMELINE_ID,
                    }
                )
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
John Spray	e22c5e7a9f	tests: extend test_sharding for splitting	2024-01-03 15:51:05 +00:00
John Spray	71ff404e6c	control_plane: support for shard splitting	2024-01-03 15:51:05 +00:00
John Spray	e9f7510abf	pageserver: implement shard splitting	2024-01-03 15:51:05 +00:00
John Spray	3e214f91de	pageserver_api: models for splitting	2024-01-03 15:51:05 +00:00
John Spray	9beceff829	pageserver_api: make ShardIdentity::stripe_size public	2024-01-03 15:43:56 +00:00
John Spray	2aeb3d49bb	pageserver_api: enrich types for use in reconciler	2024-01-03 15:43:56 +00:00
John Spray	a5af60269d	tests: fix type decorations on helpers	2024-01-03 15:43:56 +00:00
John Spray	2c0f9b65a8	pageserver: enrich a remote client log message	2024-01-03 15:43:56 +00:00
John Spray	f15291b606	tests: update restart+regress tests to use sharding	2024-01-03 15:43:56 +00:00
John Spray	7d47363568	test: add test_sharding	2024-01-03 15:43:56 +00:00
John Spray	00060a5e8f	compute_api: add shard_stripe_size to ComputeSpec	2024-01-03 15:43:56 +00:00
John Spray	c570c78816	test: update fixtures for sharding/attachment service	2024-01-03 15:43:56 +00:00
John Spray	150c3c79a5	utils: implement FromStr for NodeId	2024-01-03 15:43:56 +00:00
John Spray	79094c20bd	control_plane: major rework of attachment_service	2024-01-03 15:43:56 +00:00
John Spray	73de3568d6	pageserver/client: update APIs for sharding	2024-01-03 15:43:47 +00:00
John Spray	06b8131668	pageserver: refactor creation API (add ShardParams)	2024-01-03 15:43:47 +00:00
John Spray	b16960b3a6	pageserver: only upload initdb from shard 0	2024-01-03 15:18:33 +00:00
John Spray	50997249dd	pageserver_api: accomodate TenantShardId in location conf	2024-01-03 15:18:32 +00:00
John Spray	914a06ada7	pageserver_api: add a public version of TenantHistorySize	2024-01-03 15:18:03 +00:00
John Spray	6ff6242827	libs/http: add Timeout error	2024-01-03 14:38:16 +00:00
Konstantin Knizhnik	48e373c147	Support sharding at compute side refer #5508	2024-01-03 14:38:16 +00:00