Mirror of https://github.com/neondatabase/neon.git (synced 2026-03-12 04:40:38 +00:00)

Compare commits: proxy-cpla...proxy-asyn (132 commits)
Commits in this range (author and date columns were not captured):

9fe38ed415, e4570fb31f, dd7c4b79e3, a12b338aac, f34134faaf, c003b43781,
5dda371c2b, a60035b23a, 18fd73d84a, ee9ec26808, e22c072064, 89f023e6b0,
8426fb886b, 28e7fa98c4, a9fda8c832, fa12d60237, d551bfee09, e69ff3fc00,
25d9dc6eaf, 139d1346d5, 0bd16182f7, 6a5650d40c, 47addc15f1, b91c58a8bf,
00d9c2d9a8, 3a673dce67, 35e9fb360b, 0d21187322, e8a98adcd0, 98be8b9430,
6eb946e2de, 681a04d287, 3df67bf4d7, 0d8e68003a, 637ad4a638, 8d0f701767,
5191f6ef0e, a54ea8fb1c, d5708e7435, fd49005cb3, 3023de156e, e49e931bc4,
13b9135d4e, 41bb1e42b8, cb4b40f9c1, 9e567d9814, 1c012958c7, e5c50bb12b,
926662eb7c, 3366cd34ba, 2d5a8462c8, 110282ee7e, f752c40f58, 83cdbbb89a,
5288f9621e, e8338c60f9, 94505fd672, e92fb94149, 40f15c3123, 5299f917d6,
99a56b5606, 1628b5b145, db72543f4d, d47e4a2a41, f86845f64b, 0bb04ebe19,
5efe95a008, c0ff4f18dc, fd88d4608c, 221414de4b, dbac2d2c47, 4f4f787119,
bcab344490, f212630da2, a306d0a54b, 1081a4d246, 47b705cffe, 2d3c9f0d43,
21b3e1d13b, 0788760451, 74b2314a5d, edcaae6290, 4fc95d2d71, 534c099b42,
ec01292b55, 66fc465484, 55da8eff4f, 0fa517eb80, 8ceb4f0a69, 6019ccef06,
0c6367a732, e17bc6afb4, ac7fc6110b, 862a6b7018, 4810c22607, 9d754e984f,
375e15815c, 7ce613354e, ae15acdee7, c5f64fe54f, 40852b955d, b30b15e7cb,
36b875388f, 3f77f26aa2, 8b10407be4, 944313ffe1, d443d07518, 3de416a016,
bc05d7eb9c, d8da51e78a, 6e3834d506, 582cec53c5, 9957c6a9a0, a5777bab09,
90a8ff55fa, 3b95e8072a, 8ee54ffd30, 3ab9f56f5f, 7ddc7b4990, 63213fc814,
090123a429, 39d1818ae9, 90be79fcf5, c52b80b930, 722f271f6e, be1d8fc4f7,
25c4b676e0, 6633332e67, 5928f6709c, 63b2060aef, 24c5a5ac16, 7f9cc1bd5e
@@ -22,6 +22,7 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
+!storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
@@ -150,7 +150,7 @@ runs:

       # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
       # and to keep files on the host to upload them to the database
-      time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+      time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"

       # Generate redirect
       cat <<EOF > ${WORKDIR}/index.html
@@ -10,7 +10,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build
 outputs:
   dsn:
     description: 'Created Branch DSN (for main database)'
@@ -13,7 +13,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build

 runs:
   using: "composite"
@@ -13,7 +13,7 @@ inputs:
     default: 15
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build
   provisioner:
     desctiption: 'k8s-pod or k8s-neonvm'
     default: 'k8s-pod'
@@ -10,7 +10,7 @@ inputs:
     required: true
   api_host:
     desctiption: 'Neon API host'
-    default: console.stage.neon.tech
+    default: console-stage.neon.build

 runs:
   using: "composite"
.github/workflows/approved-for-ci-run.yml (vendored, 1 line changed)

@@ -18,6 +18,7 @@ on:

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  cancel-in-progress: false

 env:
   GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/benchmarking.yml (vendored, 58 lines changed)

@@ -147,15 +147,16 @@ jobs:
             "neonvm-captest-new"
           ],
           "db_size": [ "10gb" ],
           "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
                       { "platform": "neon-captest-new", "db_size": "50gb" },
                       { "platform": "neonvm-captest-freetier", "db_size": "3gb" },
-                      { "platform": "neonvm-captest-new", "db_size": "50gb" }]
+                      { "platform": "neonvm-captest-new", "db_size": "50gb" },
+                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
         }'

         if [ "$(date +%A)" = "Saturday" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                      { "platform": "rds-aurora", "db_size": "50gb"}]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -171,7 +172,7 @@ jobs:

         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                      { "platform": "rds-aurora" }]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -190,7 +191,7 @@ jobs:

         if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
           matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
                                                      { "platform": "rds-aurora", "scale": "10" }]')
         fi

         echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

@@ -253,6 +254,9 @@ jobs:
           neon-captest-reuse)
             CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
             ;;
+          neonvm-captest-sharding-reuse)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
+            ;;
           neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
             CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
             ;;

@@ -270,11 +274,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Benchmark init
       uses: ./.github/actions/run-python-test-set

@@ -401,11 +409,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: ClickBench benchmark
       uses: ./.github/actions/run-python-test-set

@@ -507,11 +519,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Run TPC-H benchmark
       uses: ./.github/actions/run-python-test-set

@@ -597,11 +613,15 @@ jobs:

         echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERY="SELECT version();"
+        QUERIES=("SELECT version()")
         if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+          QUERIES+=("SHOW neon.tenant_id")
+          QUERIES+=("SHOW neon.timeline_id")
         fi
-        psql ${CONNSTR} -c "${QUERY}"
+
+        for q in "${QUERIES[@]}"; do
+          psql ${CONNSTR} -c "${q}"
+        done

     - name: Run user examples
       uses: ./.github/actions/run-python-test-set
@@ -21,6 +21,7 @@ defaults:

 concurrency:
   group: build-build-tools-image-${{ inputs.image-tag }}
+  cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
.github/workflows/build_and_test.yml (vendored, 11 lines changed)

@@ -735,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:

@@ -792,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.

@@ -865,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
      env:
-        VM_BUILDER_VERSION: v0.23.2
+        VM_BUILDER_VERSION: v0.28.1

      steps:
      - name: Checkout

@@ -1127,15 +1127,15 @@ jobs:
            -f deployProxy=false \
            -f deployStorage=true \
            -f deployStorageBroker=true \
+            -f deployStorageController=true \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}} \
            -f deployPreprodRegion=true

          gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-            -f deployPgSniRouter=false \
-            -f deployProxy=false \
            -f deployStorage=true \
            -f deployStorageBroker=true \
+            -f deployStorageController=true \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}}
        elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then

@@ -1144,6 +1144,7 @@ jobs:
            -f deployProxy=true \
            -f deployStorage=false \
            -f deployStorageBroker=false \
+            -f deployStorageController=false \
            -f branch=main \
            -f dockerTag=${{needs.tag.outputs.build-tag}} \
            -f deployPreprodRegion=true
@@ -28,7 +28,9 @@ jobs:
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
+          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
+          COMMIT_SHA: ${{ github.sha }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          LAST_BUILD_TOOLS_SHA=$(
.github/workflows/pin-build-tools-image.yml (vendored, 1 line changed)

@@ -20,6 +20,7 @@ defaults:

 concurrency:
   group: pin-build-tools-image-${{ inputs.from-tag }}
+  cancel-in-progress: false

 permissions: {}

.github/workflows/trigger-e2e-tests.yml (vendored, 90 lines changed)

@@ -62,14 +62,14 @@ jobs:

   trigger-e2e-tests:
     needs: [ tag ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: ubuntu-latest
     env:
       TAG: ${{ needs.tag.outputs.build-tag }}
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
     steps:
       - name: check if ecr image are present
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
         run: |
           for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
             OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)

@@ -79,41 +79,55 @@ jobs:
           fi
         done

-      - name: Set PR's status to pending and request a remote CI test
+      - name: Set e2e-platforms
+        id: e2e-platforms
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
-          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
-          # to place a job run status update later.
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          # Default set of platforms to run e2e tests on
+          platforms='["docker", "k8s"]'

-          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
+          # If the workflow run is not a pull request, add k8s-neonvm to the list.
+          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
+            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
+              case "$f" in
+                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
+                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+                  ;;
+                *)
+                  # no-op
+                  ;;
+              esac
+            done
+          else
+            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
+          fi

-          curl -f -X POST \
-          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"state\": \"pending\",
-              \"context\": \"neon-cloud-e2e\",
-              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-            }"
+          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT

-          curl -f -X POST \
-          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-          -H "Accept: application/vnd.github.v3+json" \
-          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-          --data \
-            "{
-              \"ref\": \"main\",
-              \"inputs\": {
-                \"ci_job_name\": \"neon-cloud-e2e\",
-                \"commit_hash\": \"$COMMIT_SHA\",
-                \"remote_repo\": \"${{ github.repository }}\",
-                \"storage_image_tag\": \"${TAG}\",
-                \"compute_image_tag\": \"${TAG}\",
-                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
-              }
-            }"
+      - name: Set PR's status to pending and request a remote CI test
+        env:
+          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+
+          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
+            --method POST \
+            --raw-field "state=pending" \
+            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
+            --raw-field "context=neon-cloud-e2e"
+
+          gh workflow --repo ${REMOTE_REPO} \
+            run testing.yml \
+            --ref "main" \
+            --raw-field "ci_job_name=neon-cloud-e2e" \
+            --raw-field "commit_hash=$COMMIT_SHA" \
+            --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
+            --raw-field "storage_image_tag=${TAG}" \
+            --raw-field "compute_image_tag=${TAG}" \
+            --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
+            --raw-field "e2e-platforms=${E2E_PLATFORMS}"
@@ -1,5 +1,5 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/attachment_service @neondatabase/storage
+/storage_controller @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
Cargo.lock (generated, 719 lines changed): file diff suppressed because it is too large.
Cargo.toml (27 lines changed)

@@ -3,7 +3,7 @@ resolver = "2"
 members = [
     "compute_tools",
     "control_plane",
-    "control_plane/attachment_service",
+    "control_plane/storcon_cli",
     "pageserver",
     "pageserver/compaction",
     "pageserver/ctl",

@@ -12,6 +12,7 @@ members = [
     "proxy",
     "safekeeper",
     "storage_broker",
+    "storage_controller",
     "s3_scrubber",
     "workspace_hack",
     "trace",

@@ -43,6 +44,7 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
+atomic-take = "1.1.0"
 azure_core = "0.18"
 azure_identity = "0.18"
 azure_storage = "0.18"

@@ -55,6 +57,7 @@ aws-sdk-s3 = "1.14"
 aws-sdk-iam = "1.15.0"
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
+aws-smithy-runtime = "1.1.8"
 aws-credential-types = "1.1.4"
 aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
 aws-types = "1.1.7"

@@ -96,7 +99,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.11"
+hyper-tungstenite = "0.13.0"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"

@@ -105,7 +108,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.13", features=["default", "lasso"] }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }

@@ -154,11 +158,12 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle" = "2.5.0"
-svg_fmt = "0.4.1"
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.1"
+test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"

@@ -190,11 +195,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

@@ -234,7 +239,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }

 # bug fixes for UUID
 parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
@@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
     && mv protoc/include/google /usr/local/include/google \
     && rm -rf protoc.zip protoc

+# s5cmd
+ENV S5CMD_VERSION=2.2.2
+RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
+    && chmod +x s5cmd \
+    && mv s5cmd /usr/local/bin/s5cmd
+
 # LLVM
 ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
@@ -944,6 +944,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

+# Create remote extension download directory
+RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
+
 # Install:
 #   libreadline8 for psql
 #   libicu67, locales for collations (including ICU and plpgsql_check)
@@ -818,9 +818,15 @@ impl ComputeNode {
         Client::connect(zenith_admin_connstr.as_str(), NoTls)
             .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
         // Disable forwarding so that users don't get a cloud_admin role
-        client.simple_query("SET neon.forward_ddl = false")?;
-        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-        client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+        let mut func = || {
+            client.simple_query("SET neon.forward_ddl = false")?;
+            client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+            client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+            Ok::<_, anyhow::Error>(())
+        };
+        func().context("apply_config setup cloud_admin")?;

         drop(client);

         // reconnect with connstring with expected name

@@ -832,24 +838,29 @@ impl ComputeNode {
         };

         // Disable DDL forwarding because control plane already knows about these roles/databases.
-        client.simple_query("SET neon.forward_ddl = false")?;
+        client
+            .simple_query("SET neon.forward_ddl = false")
+            .context("apply_config SET neon.forward_ddl = false")?;

         // Proceed with post-startup configuration. Note, that order of operations is important.
         let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client)?;
-        handle_roles(spec, &mut client)?;
-        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
+        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
+        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
+        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)
+            .context("apply_config handle_role_deletions")?;
         handle_grants(
             spec,
             &mut client,
             connstr.as_str(),
             self.has_feature(ComputeFeature::AnonExtension),
-        )?;
-        handle_extensions(spec, &mut client)?;
-        handle_extension_neon(&mut client)?;
-        create_availability_check_data(&mut client)?;
+        )
+        .context("apply_config handle_grants")?;
+        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
+        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
+        create_availability_check_data(&mut client)
+            .context("apply_config create_availability_check_data")?;

         // 'Close' connection
         drop(client);

@@ -857,7 +868,7 @@ impl ComputeNode {
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
             let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client)
+            handle_migrations(&mut client).context("apply_config handle_migrations")
         });
         Ok(())
     }

@@ -1262,10 +1273,12 @@ LIMIT 100",
         .await
         .map_err(DownloadError::Other);

-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
+        if download_size.is_ok() {
+            self.ext_download_progress
+                .write()
+                .expect("bad lock")
+                .insert(ext_archive_name.to_string(), (download_start, true));
+        }

         download_size
     }
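A pattern worth noting across these compute_tools hunks: several fallible `simple_query` calls are grouped in a closure so that a single `anyhow::Context` annotation labels the whole configuration step. A minimal sketch of the idiom, assuming only the `postgres` and `anyhow` crates (the function name and queries are illustrative, not from the diff):

```rust
use anyhow::Context;
use postgres::Client;

// Illustrative only: groups several fallible calls behind one context label,
// mirroring the `let mut func = || { ... }; func().context(...)` shape above.
fn setup_admin_role(client: &mut Client) -> anyhow::Result<()> {
    let mut func = || {
        client.simple_query("SET neon.forward_ddl = false")?;
        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
        // Pin the closure's error type so `?` converts postgres::Error.
        Ok::<_, anyhow::Error>(())
    };
    func().context("setup cloud_admin")?;
    Ok(())
}
```

The closure keeps the happy path terse while the single `.context` call records which configuration phase failed, which is exactly what the apply_config hunks add step by step.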
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
+use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.

@@ -92,6 +92,27 @@ pub fn write_postgres_conf(
         }
     }

+    if cfg!(target_os = "linux") {
+        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
+        // disabled), then the control plane has enabled swap and we should set
+        // dynamic_shared_memory_type = 'mmap'.
+        //
+        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
+        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
+            // ignore any errors - they may be expected to occur under certain situations (e.g. when
+            // not running in Linux).
+            .unwrap_or_else(|_| String::new());
+        if overcommit_memory_contents.trim() == "2" {
+            let opt = GenericOption {
+                name: "dynamic_shared_memory_type".to_owned(),
+                value: Some("mmap".to_owned()),
+                vartype: "enum".to_owned(),
+            };
+
+            write!(file, "{}", opt.to_pg_setting())?;
+        }
+    }
+
     // If there are any extra options in the 'settings' field, append those
     if spec.cluster.settings.is_some() {
         writeln!(file, "# Managed by compute_ctl: begin")?;
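For reference, the probe this new config.rs block performs can be run standalone: `/proc/sys/vm/overcommit_memory` holds `2` when strict overcommit accounting is in effect. A stdlib-only sketch that reads it the same tolerant way (any read failure is treated as "not strict", as in the diff):

```rust
// Standalone sketch of the overcommit probe; stdlib only.
fn overcommit_disabled() -> bool {
    std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
        // Read errors (e.g. not on Linux) fall back to "not disabled",
        // matching the diff's unwrap_or_else(|_| String::new()).
        .map(|contents| contents.trim() == "2")
        .unwrap_or(false)
}

fn main() {
    if overcommit_disabled() {
        // The real code appends `dynamic_shared_memory_type = 'mmap'`
        // to postgresql.conf at this point.
        println!("strict overcommit: would set dynamic_shared_memory_type = 'mmap'");
    } else {
        println!("overcommit not strict: keep Postgres defaults");
    }
}
```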
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
     format!("'{}'", res)
 }

-trait GenericOptionExt {
+pub trait GenericOptionExt {
     fn to_pg_option(&self) -> String;
     fn to_pg_setting(&self) -> String;
 }
@@ -2,7 +2,7 @@ use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

-use anyhow::{anyhow, bail, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
 use reqwest::StatusCode;

@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
             RoleAction::Create => {
                 // This branch only runs when roles are created through the console, so it is
                 // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
+                // from neon_superuser.
                 let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                     name.pg_quote()
                 );
                 info!("running role create query: '{}'", &query);

@@ -698,7 +698,8 @@ pub fn handle_grants(

     // it is important to run this after all grants
     if enable_anon_extension {
-        handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
+        handle_extension_anon(spec, &db.owner, &mut db_client, false)
+            .context("handle_grants handle_extension_anon")?;
     }
 }

@@ -743,21 +744,24 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
     // which may happen in two cases:
     // - extension was just installed
     // - extension was already installed and is up to date
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    if let Err(e) = client.simple_query(query) {
+        error!(
+            "failed to upgrade neon extension during `handle_extension_neon`: {}",
+            e
+        );
+    }

     Ok(())
 }

 #[instrument(skip_all)]
-pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
-    info!("handle neon extension upgrade (not really)");
-    // DISABLED due to compute node unpinning epic
-    // let query = "ALTER EXTENSION neon UPDATE";
-    // info!("update neon extension version with query: {}", query);
-    // client.simple_query(query)?;
+pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
+    info!("handle neon extension upgrade");
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension version with query: {}", query);
+    client.simple_query(query)?;

     Ok(())
 }

@@ -806,43 +810,40 @@ $$;"#,
         "",
         "",
         "",
+        "",
         // Add new migrations below.
-        r#"
-DO $$
-DECLARE
-    role_name TEXT;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
-    END LOOP;
-END
-$$;"#,
     ];

-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-    client.simple_query(query)?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;

-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-    client.simple_query(query)?;
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;

-    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-    client.simple_query(query)?;
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;

-    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-    client.simple_query(query)?;
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;

-    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-    client.simple_query(query)?;
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;

-    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client.query_one(query, &[])?;
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
     let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
     let starting_migration_id = current_migration;

-    query = "BEGIN";
-    client.simple_query(query)?;
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;

     while current_migration < migrations.len() {
         let migration = &migrations[current_migration];

@@ -850,7 +851,9 @@ $$;"#,
             info!("Skip migration id={}", current_migration);
         } else {
             info!("Running migration:\n{}\n", migration);
-            client.simple_query(migration)?;
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
         }
         current_migration += 1;
     }

@@ -858,10 +861,14 @@ $$;"#,
         "UPDATE neon_migration.migration_id SET id={}",
         migrations.len()
     );
-    client.simple_query(&setval)?;
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;

-    query = "COMMIT";
-    client.simple_query(query)?;
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;

     info!(
         "Ran {} migrations",
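The migration-runner changes above keep the original control flow (read the stored id, BEGIN, replay pending entries, bump the id, COMMIT) and only add per-step contexts. A condensed sketch of that flow, assuming the `postgres` and `anyhow` crates; the skip-empty check here stands in for the diff's skip branch:

```rust
use anyhow::Context;
use postgres::Client;

fn run_migrations(client: &mut Client, migrations: &[&str]) -> anyhow::Result<()> {
    // Which migrations already ran?
    let row = client
        .query_one("SELECT id FROM neon_migration.migration_id", &[])
        .context("get migration_id")?;
    let mut current: usize = row.get::<&str, i64>("id") as usize;

    // Replay everything pending inside one transaction.
    client.simple_query("BEGIN").context("begin")?;
    while current < migrations.len() {
        let migration = migrations[current];
        if !migration.is_empty() {
            client
                .simple_query(migration)
                .with_context(|| format!("current_migration={current}"))?;
        }
        current += 1;
    }
    let setval = format!(
        "UPDATE neon_migration.migration_id SET id={}",
        migrations.len()
    );
    client.simple_query(&setval).context("update id")?;
    client.simple_query("COMMIT").context("commit")?;
    Ok(())
}
```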
@@ -86,7 +86,10 @@ where
         .stdout(process_log_file)
         .stderr(same_file_for_stderr)
         .args(args);
-    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
+
+    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
+        fill_rust_env_vars(background_command),
+    ));
     filled_cmd.envs(envs);

     let pid_file_to_check = match &initial_pid_file {

@@ -268,6 +271,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
     cmd
 }

+fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
+    for (var, val) in std::env::vars() {
+        if var.starts_with("NEON_PAGESERVER_") {
+            cmd = cmd.env(var, val);
+        }
+    }
+    cmd
+}
+
 /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
 /// 1. Claims a pidfile with a fcntl lock on it and
 /// 2. Sets up the pidfile's file descriptor so that it (and the lock)
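`fill_env_vars_prefixed_neon` simply forwards every `NEON_PAGESERVER_`-prefixed variable from the parent environment into the spawned command. The same filter can be demonstrated standalone; in this sketch `env_clear` makes the effect observable (the real code does not clear the environment, it only adds the explicit forwards):

```rust
use std::process::Command;

fn main() -> std::io::Result<()> {
    let mut cmd = Command::new("env"); // any binary that prints its environment
    cmd.env_clear(); // demo only: isolate so the filter's effect is visible
    // Forward only NEON_PAGESERVER_*-prefixed variables, as in the diff.
    cmd.envs(std::env::vars().filter(|(k, _)| k.starts_with("NEON_PAGESERVER_")));
    let status = cmd.status()?;
    println!("child exited with {status}");
    Ok(())
}
```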
@@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
-};
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
     ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };

@@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         }
     }

-        Some(("set-state", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            let scheduling = subcommand_args.get_one("scheduling");
-            let availability = subcommand_args.get_one("availability");
-
-            let storage_controller = StorageController::from_env(env);
-            storage_controller
-                .node_configure(NodeConfigureRequest {
-                    node_id: pageserver.conf.id,
-                    scheduling: scheduling.cloned(),
-                    availability: availability.cloned(),
-                })
-                .await?;
-        }
-
         Some(("status", subcommand_args)) => {
             match get_pageserver(env, subcommand_args)?.check_status().await {
                 Ok(_) => println!("Page server is up and running"),

@@ -1248,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
             for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
+                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
                     eprintln!("postgres stop failed: {e:#}");
                 }
             }

@@ -1434,6 +1417,7 @@ fn cli() -> Command {
         .subcommand(
             Command::new("timeline")
             .about("Manage timelines")
+            .arg_required_else_help(true)
             .subcommand(Command::new("list")
                 .about("List all timelines, available to this pageserver")
                 .arg(tenant_id_arg.clone()))

@@ -1515,12 +1499,6 @@ fn cli() -> Command {
                 .about("Restart local pageserver")
                 .arg(pageserver_config_args.clone())
             )
-            .subcommand(Command::new("set-state")
-                .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
-                .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
-                .about("Set scheduling or availability state of pageserver node")
-                .arg(pageserver_config_args.clone())
-            )
         )
         .subcommand(
             Command::new("storage_controller")
@@ -156,6 +156,7 @@ pub struct SafekeeperConf {
     pub remote_storage: Option<String>,
     pub backup_threads: Option<u32>,
     pub auth_enabled: bool,
+    pub listen_addr: Option<String>,
 }

 impl Default for SafekeeperConf {

@@ -169,6 +170,7 @@ impl Default for SafekeeperConf {
             remote_storage: None,
             backup_threads: None,
             auth_enabled: false,
+            listen_addr: None,
         }
     }
 }
@@ -389,6 +389,10 @@ impl PageServerNode {
                 .remove("image_creation_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()?,
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")

@@ -501,6 +505,12 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()
+                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
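Both hunks above lean on the same Option-parsing idiom: map a `parse` over an optional setting, then `transpose` the resulting `Option<Result<_>>` into `Result<Option<_>>` so `?` and `.context` apply. Isolated as a small runnable sketch:

```rust
use anyhow::Context;

// Parse an optional string setting into Option<u8>, surfacing parse errors.
fn parse_check_threshold(raw: Option<&str>) -> anyhow::Result<Option<u8>> {
    raw.map(|x| x.parse::<u8>())
        .transpose() // Option<Result<u8, _>> -> Result<Option<u8>, _>
        .context("Failed to parse 'image_layer_creation_check_threshold' as integer")
}

fn main() -> anyhow::Result<()> {
    assert_eq!(parse_check_threshold(Some("4"))?, Some(4));
    assert_eq!(parse_check_threshold(None)?, None);
    assert!(parse_check_threshold(Some("not-a-number")).is_err());
    Ok(())
}
```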
@@ -70,24 +70,31 @@ pub struct SafekeeperNode {
|
|||||||
pub pg_connection_config: PgConnectionConfig,
|
pub pg_connection_config: PgConnectionConfig,
|
||||||
pub env: LocalEnv,
|
pub env: LocalEnv,
|
||||||
pub http_client: reqwest::Client,
|
pub http_client: reqwest::Client,
|
||||||
|
pub listen_addr: String,
|
||||||
pub http_base_url: String,
|
pub http_base_url: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SafekeeperNode {
|
impl SafekeeperNode {
|
||||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||||
|
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
||||||
|
listen_addr.clone()
|
||||||
|
} else {
|
||||||
|
"127.0.0.1".to_string()
|
||||||
|
};
|
||||||
SafekeeperNode {
|
SafekeeperNode {
|
||||||
id: conf.id,
|
id: conf.id,
|
||||||
conf: conf.clone(),
|
conf: conf.clone(),
|
||||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||||
env: env.clone(),
|
env: env.clone(),
|
||||||
http_client: reqwest::Client::new(),
|
http_client: reqwest::Client::new(),
|
||||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||||
|
listen_addr,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Construct libpq connection string for connecting to this safekeeper.
|
/// Construct libpq connection string for connecting to this safekeeper.
|
||||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
||||||
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||||
@@ -111,8 +118,8 @@ impl SafekeeperNode {
|
|||||||
);
|
);
|
||||||
io::stdout().flush().unwrap();
|
io::stdout().flush().unwrap();
|
||||||
|
|
||||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
||||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
||||||
let id = self.id;
|
let id = self.id;
|
||||||
let datadir = self.datadir_path();
|
let datadir = self.datadir_path();
|
||||||
|
|
||||||
@@ -139,7 +146,7 @@ impl SafekeeperNode {
|
|||||||
availability_zone,
|
availability_zone,
|
||||||
];
|
];
|
||||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||||
}
|
}
|
||||||
if !self.conf.sync {
|
if !self.conf.sync {
|
||||||
|
control_plane/storcon_cli/Cargo.toml (new file, 23 lines)

[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true
control_plane/storcon_cli/src/main.rs (new file, 681 lines)

use std::{collections::HashMap, str::FromStr, time::Duration};

use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};

use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};

#[derive(Subcommand, Debug)]
enum Command {
    /// Register a pageserver with the storage controller. This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,

        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,

        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },

    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,

        /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
        /// manually mark a node offline
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers. This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
    /// mode so that it can warm up content on a pageserver.
    TenantWarmup {
        #[arg(long)]
        tenant_id: TenantId,
    },
}

#[derive(Parser)]
#[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
    #[arg(long)]
    /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,

    #[arg(long)]
    /// JWT token for authenticating with storage controller. Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,

    #[command(subcommand)]
    command: Command,
}

#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);

impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ if s.starts_with("attached:") => {
                let mut splitter = s.split(':');
                let _prefix = splitter.next().unwrap();
                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
                    None => Err(anyhow::anyhow!(
                        "Invalid format '{s}', a valid example is 'attached:1'"
                    )),
                }
            }
            _ => Err(anyhow::anyhow!(
                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
            )),
        }
    }
}

#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);

impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
            _ => Err(anyhow::anyhow!(
                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
            )),
        }
    }
}

#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);

impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
        }
    }
}

struct Client {
    base_url: Url,
    jwt_token: Option<String>,
    client: reqwest::Client,
}

impl Client {
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }

    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        let url = Url::from_str(&format!(
            "http://{}:{}/{path}",
            self.base_url.host_str().unwrap(),
            self.base_url.port().unwrap()
        ))
        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }

        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        let response = response.error_from_body().await?;

        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());

    let mut trimmed = cli.api.to_string();
    trimmed.pop();
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());

    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                let shard_zero = tenant.shards.into_iter().next().unwrap();
                table.add_row([
                    format!("{}", tenant.tenant_id),
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }

            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };

            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;

            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;

            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                let entry = node_to_shards.entry(s.node_id).or_default();
                entry.push(s.shard_id);
            }

            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;

            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    node_to_shards.entry(node.id).or_default();
                }
            }

            let max_shard_per_node = shard_count / node_to_shards.len();

            loop {
                let mut migrate_shard = None;
                for shards in node_to_shards.values_mut() {
                    if shards.len() > max_shard_per_node {
                        // Pick the emptiest
                        migrate_shard = Some(shards.pop().unwrap());
                    }
                }
                let Some(migrate_shard) = migrate_shard else {
                    break;
                };

                // Pick the emptiest node to migrate to
                let mut destinations = node_to_shards
                    .iter()
                    .map(|(k, v)| (k, v.len()))
                    .collect::<Vec<_>>();
                destinations.sort_by_key(|i| i.1);
                let (destination_node, destination_count) = *destinations.first().unwrap();
                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }
                let destination_node = *destination_node;

                node_to_shards
                    .get_mut(&destination_node)
                    .unwrap()
                    .push(migrate_shard);

                println!("Migrate {} -> {} ...", migrate_shard, destination_node);

                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }

            // Spread the shards across the nodes
        }
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| format!("{}", n))
                    .collect::<Vec<_>>()
                    .join(",");

                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }

                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }

                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");

                table.add_row([
                    format!("{}", shard.tenant_shard_id),
                    shard
                        .node_attached
                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await;
            match describe_response {
                Ok(describe) => {
                    if matches!(describe.policy, PlacementPolicy::Secondary) {
                        // Fine: it's already known to controller in secondary mode: calling
                        // again to put it into secondary mode won't cause problems.
                    } else {
                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
                    }
                }
                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
                    // Fine: this tenant isn't known to the storage controller yet.
                }
                Err(e) => {
                    // Unexpected API error
                    return Err(e.into());
                }
            }

            vps_client
                .location_config(
                    TenantShardId::unsharded(tenant_id),
                    pageserver_api::models::LocationConfig {
                        mode: pageserver_api::models::LocationConfigMode::Secondary,
                        generation: None,
                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
                        shard_number: 0,
                        shard_count: 0,
                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
                        tenant_conf: TenantConfig::default(),
                    },
                    None,
                    true,
                )
                .await?;

            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;

            let secondary_ps_id = describe_response
                .shards
                .first()
                .unwrap()
                .node_secondary
                .first()
                .unwrap();

            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
            loop {
                let (status, progress) = vps_client
                    .tenant_secondary_download(
                        TenantShardId::unsharded(tenant_id),
                        Some(Duration::from_secs(10)),
                    )
                    .await?;
                println!(
                    "Progress: {}/{} layers, {}/{} bytes",
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                match status {
                    StatusCode::OK => {
                        println!("Download complete");
                        break;
                    }
                    StatusCode::ACCEPTED => {
                        // Loop
                    }
                    _ => {
                        anyhow::bail!("Unexpected download status: {status}");
                    }
                }
            }
        }
    }

    Ok(())
}
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli
 
 [print_schema]
-file = "control_plane/attachment_service/src/schema.rs"
+file = "storage_controller/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]
 
 [migrations_directory]
-dir = "control_plane/attachment_service/migrations"
+dir = "storage_controller/migrations"
@@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphabetical order.
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 
+`storage_controller`:
+
+Neon storage controller, manages a cluster of pageservers and exposes an API that enables
+managing a many-sharded tenant as a single entity.
+
 `/control_plane`:
 
 Local control plane.
docs/storage_controller.md (new file, 150 lines)

# Storage Controller

## Concepts

The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).

It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.

The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an "intent" state and a "reconcile" task that tries to make the outside world match the intent.

## APIs

The storage controller's HTTP server implements four logically separate APIs:

- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's own API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
  to ensure data safety with generation numbers.

The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers' APIs).

See the `http.rs` file in the source for where the HTTP APIs are implemented.

## Database

The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.

The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

The `diesel` crate is used for defining models & migrations.

Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.

### Diesel tip: migrations

If you need to modify the database schema, here's how to create a migration:

- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to `schema.rs`
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.

## storcon_cli

The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).

`storcon_cli --help` includes details on commands.

# Deploying

This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.

_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._

## Database

It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.

The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.

Set the URL to the database using the `--database-url` CLI option.

There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.

## Configure pageservers to use the storage controller

1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
   with the storage controller when it starts up. See the example below for the format of this file.

### Example `metadata.json`

```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```

- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
  postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP API; this must be accessible from where
  the storage controller runs.

## Handle compute notifications

The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.

The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.

In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.

When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.

```
struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
}

struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}
```

When a notification is received:

1. Modify postgres configuration for this tenant:
   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
     shards identified by `NodeId` must be converted to the address+port of the node.
   - if `stripe_size` is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
   will retry the notification until it succeeds.

### Example notification body

```
{
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "shards": [
    {"node_id": 344, "shard_number": 0},
    {"node_id": 722, "shard_number": 1}
  ]
}
```
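For on-premise implementers, a minimal sketch of the receiving side of this hook may help. This is only an illustration under stated assumptions: the serde derives, the plain numeric field types, and the `node_connstr` lookup are hypothetical, not part of the storage controller itself.

```
use serde::Deserialize;

// Hypothetical mirror of the notification body; field types simplified
// (NodeId as u64, shard_number as u8) for the sketch.
#[derive(Deserialize)]
struct ComputeHookNotifyRequestShard {
    node_id: u64,
    shard_number: u8,
}

#[derive(Deserialize)]
struct ComputeHookNotifyRequest {
    tenant_id: String,
    stripe_size: Option<u32>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}

/// Build the value for `neon.pageserver_connstr`: one libpq connection string
/// per shard, ordered by shard number, comma separated. `node_connstr` is a
/// hypothetical lookup from pageserver NodeId to its postgres address+port.
fn pageserver_connstr(
    req: &ComputeHookNotifyRequest,
    node_connstr: impl Fn(u64) -> String,
) -> String {
    let mut shards: Vec<_> = req.shards.iter().collect();
    shards.sort_by_key(|s| s.shard_number);
    shards
        .into_iter()
        .map(|s| node_connstr(s.node_id))
        .collect::<Vec<_>>()
        .join(",")
}
```

After rewriting the configuration (and `neon.stripe_size`, when present), the handler would signal postgres with SIGHUP and only then return 200, so that the controller's retry loop keeps working if anything fails.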
@@ -10,11 +10,13 @@ libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
 twox-hash.workspace = true
+measured.workspace = true
 
 workspace_hack.workspace = true
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
+measured-process.workspace = true
 
 [dev-dependencies]
 rand = "0.8"
|||||||
@@ -7,14 +7,19 @@
|
|||||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap,
|
hash::{BuildHasher, BuildHasherDefault, Hash},
|
||||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
sync::atomic::AtomicU8,
|
||||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use prometheus::{
|
use measured::{
|
||||||
core::{self, Describer},
|
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
|
||||||
proto, Opts,
|
metric::{
|
||||||
|
group::{Encoding, MetricValue},
|
||||||
|
name::MetricNameEncoder,
|
||||||
|
Metric, MetricType, MetricVec,
|
||||||
|
},
|
||||||
|
text::TextEncoder,
|
||||||
|
LabelGroup,
|
||||||
};
|
};
|
||||||
use twox_hash::xxh3;
|
use twox_hash::xxh3;
|
||||||
|
|
||||||
@@ -93,203 +98,25 @@ macro_rules! register_hll {
|
|||||||
/// ```
|
/// ```
|
||||||
///
|
///
|
||||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||||
#[derive(Clone)]
|
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
|
||||||
pub struct HyperLogLogVec<const N: usize> {
|
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
|
||||||
core: Arc<HyperLogLogVecCore<N>>,
|
|
||||||
|
pub struct HyperLogLogState<const N: usize> {
|
||||||
|
shards: [AtomicU8; N],
|
||||||
}
|
}
|
||||||
|
impl<const N: usize> Default for HyperLogLogState<N> {
|
||||||
struct HyperLogLogVecCore<const N: usize> {
|
fn default() -> Self {
|
||||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
#[allow(clippy::declare_interior_mutable_const)]
|
||||||
pub desc: core::Desc,
|
const ZERO: AtomicU8 = AtomicU8::new(0);
|
||||||
pub opts: Opts,
|
Self { shards: [ZERO; N] }
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
|
||||||
fn desc(&self) -> Vec<&core::Desc> {
|
|
||||||
vec![&self.core.desc]
|
|
||||||
}
|
|
||||||
|
|
||||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
|
||||||
let mut m = proto::MetricFamily::default();
|
|
||||||
m.set_name(self.core.desc.fq_name.clone());
|
|
||||||
m.set_help(self.core.desc.help.clone());
|
|
||||||
m.set_field_type(proto::MetricType::GAUGE);
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
for child in self.core.children.read().unwrap().values() {
|
|
||||||
child.core.collect_into(&mut metrics);
|
|
||||||
}
|
|
||||||
m.set_metric(metrics);
|
|
||||||
|
|
||||||
vec![m]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogVec<N> {
|
impl<const N: usize> MetricType for HyperLogLogState<N> {
|
||||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
type Metadata = ();
|
||||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
|
||||||
/// provided.
|
|
||||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
|
||||||
assert!(N.is_power_of_two());
|
|
||||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
|
||||||
let opts = opts.variable_labels(variable_names);
|
|
||||||
|
|
||||||
let desc = opts.describe()?;
|
|
||||||
let v = HyperLogLogVecCore {
|
|
||||||
children: RwLock::new(HashMap::default()),
|
|
||||||
desc,
|
|
||||||
opts,
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self { core: Arc::new(v) })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
|
||||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
|
||||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
|
||||||
///
|
|
||||||
/// An error is returned if the number of label values is not the same as the
|
|
||||||
/// number of VariableLabels in Desc.
|
|
||||||
pub fn get_metric_with_label_values(
|
|
||||||
&self,
|
|
||||||
vals: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
self.core.get_metric_with_label_values(vals)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
|
||||||
/// occurs.
|
|
||||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
|
||||||
self.get_metric_with_label_values(vals).unwrap()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
impl<const N: usize> HyperLogLogState<N> {
|
||||||
pub fn get_metric_with_label_values(
|
|
||||||
&self,
|
|
||||||
vals: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
let h = self.hash_label_values(vals)?;
|
|
||||||
|
|
||||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
|
||||||
return Ok(metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.get_or_create_metric(h, vals)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
|
||||||
if vals.len() != self.desc.variable_labels.len() {
|
|
||||||
return Err(prometheus::Error::InconsistentCardinality {
|
|
||||||
expect: self.desc.variable_labels.len(),
|
|
||||||
got: vals.len(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut h = xxh3::Hash64::default();
|
|
||||||
for val in vals {
|
|
||||||
h.write(val.as_bytes());
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(h.finish())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_or_create_metric(
|
|
||||||
&self,
|
|
||||||
hash: u64,
|
|
||||||
label_values: &[&str],
|
|
||||||
) -> prometheus::Result<HyperLogLog<N>> {
|
|
||||||
let mut children = self.children.write().unwrap();
|
|
||||||
// Check exist first.
|
|
||||||
if let Some(metric) = children.get(&hash).cloned() {
|
|
||||||
return Ok(metric);
|
|
||||||
}
|
|
||||||
|
|
||||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
|
||||||
children.insert(hash, metric.clone());
|
|
||||||
Ok(metric)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// HLL is a probabilistic cardinality measure.
|
|
||||||
///
|
|
||||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # harmonic mean
|
|
||||||
/// 1 / (
|
|
||||||
/// sum (
|
|
||||||
/// 2 ^ -(
|
|
||||||
/// # HLL merge operation
|
|
||||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
|
||||||
/// )
|
|
||||||
/// ) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// * alpha
|
|
||||||
/// * shards_count
|
|
||||||
/// * shards_count
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// If you want an estimate over time, you can use the following query:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # harmonic mean
|
|
||||||
/// 1 / (
|
|
||||||
/// sum (
|
|
||||||
/// 2 ^ -(
|
|
||||||
/// # HLL merge operation
|
|
||||||
/// max (
|
|
||||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
|
||||||
/// ) by (hll_shard, other_labels...)
|
|
||||||
/// )
|
|
||||||
/// ) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// * alpha
|
|
||||||
/// * shards_count
|
|
||||||
/// * shards_count
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
|
||||||
///
|
|
||||||
/// ```promql
|
|
||||||
/// # LinearCounting(m, V) = m log (m / V)
|
|
||||||
/// shards_count * ln(shards_count /
|
|
||||||
/// # calculate V = how many shards contain a 0
|
|
||||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
|
||||||
/// )
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct HyperLogLog<const N: usize> {
|
|
||||||
core: Arc<HyperLogLogCore<N>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLog<N> {
|
|
||||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
|
||||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
|
||||||
assert!(N.is_power_of_two());
|
|
||||||
let opts = Opts::new(name, help);
|
|
||||||
Self::with_opts(opts)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
|
||||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
|
||||||
Self::with_opts_and_label_values(&opts, &[])
|
|
||||||
}
|
|
||||||
|
|
||||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
|
||||||
let desc = opts.describe()?;
|
|
||||||
let labels = make_label_pairs(&desc, label_values)?;
|
|
||||||
|
|
||||||
let v = HyperLogLogCore {
|
|
||||||
shards: [0; N].map(AtomicU8::new),
|
|
||||||
desc,
|
|
||||||
labels,
|
|
||||||
};
|
|
||||||
Ok(Self { core: Arc::new(v) })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn measure(&self, item: &impl Hash) {
|
pub fn measure(&self, item: &impl Hash) {
|
||||||
// changing the hasher will break compatibility with previous measurements.
|
// changing the hasher will break compatibility with previous measurements.
|
||||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||||
@@ -299,42 +126,11 @@ impl<const N: usize> HyperLogLog<N> {
|
|||||||
let p = N.ilog2() as u8;
|
let p = N.ilog2() as u8;
|
||||||
let j = hash & (N as u64 - 1);
|
let j = hash & (N as u64 - 1);
|
||||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct HyperLogLogCore<const N: usize> {
|
|
||||||
shards: [AtomicU8; N],
|
|
||||||
desc: core::Desc,
|
|
||||||
labels: Vec<proto::LabelPair>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
|
||||||
fn desc(&self) -> Vec<&core::Desc> {
|
|
||||||
vec![&self.core.desc]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
fn take_sample(&self) -> [u8; N] {
|
||||||
let mut m = proto::MetricFamily::default();
|
self.shards.each_ref().map(|x| {
|
||||||
m.set_name(self.core.desc.fq_name.clone());
|
|
||||||
m.set_help(self.core.desc.help.clone());
|
|
||||||
m.set_field_type(proto::MetricType::GAUGE);
|
|
||||||
|
|
||||||
let mut metrics = Vec::new();
|
|
||||||
self.core.collect_into(&mut metrics);
|
|
||||||
m.set_metric(metrics);
|
|
||||||
|
|
||||||
vec![m]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<const N: usize> HyperLogLogCore<N> {
|
|
||||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
|
||||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
|
||||||
let mut shard_label = proto::LabelPair::default();
|
|
||||||
shard_label.set_name("hll_shard".to_owned());
|
|
||||||
shard_label.set_value(format!("{i}"));
|
|
||||||
|
|
||||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||||
|
|
||||||
// This seems like it would be a race condition,
|
// This seems like it would be a race condition,
|
||||||
@@ -344,85 +140,90 @@ impl<const N: usize> HyperLogLogCore<N> {
|
|||||||
|
|
||||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
x.swap(0, std::sync::atomic::Ordering::Relaxed)
|
||||||
|
|
||||||
let mut m = proto::Metric::default();
|
|
||||||
let mut c = proto::Gauge::default();
|
|
||||||
c.set_value(v as f64);
|
|
||||||
m.set_gauge(c);
|
|
||||||
|
|
||||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
|
||||||
labels.extend_from_slice(&self.labels);
|
|
||||||
labels.push(shard_label);
|
|
||||||
|
|
||||||
m.set_label(labels);
|
|
||||||
metrics.push(m);
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
|
||||||
fn make_label_pairs(
|
for HyperLogLogState<N>
|
||||||
desc: &core::Desc,
|
{
|
||||||
label_values: &[&str],
|
fn write_type(
|
||||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
name: impl MetricNameEncoder,
|
||||||
if desc.variable_labels.len() != label_values.len() {
|
enc: &mut TextEncoder<W>,
|
||||||
return Err(prometheus::Error::InconsistentCardinality {
|
) -> Result<(), std::io::Error> {
|
||||||
expect: desc.variable_labels.len(),
|
enc.write_type(&name, measured::text::MetricType::Gauge)
|
||||||
got: label_values.len(),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
fn collect_into(
|
||||||
|
&self,
|
||||||
|
_: &(),
|
||||||
|
labels: impl LabelGroup,
|
||||||
|
name: impl MetricNameEncoder,
|
||||||
|
enc: &mut TextEncoder<W>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
struct I64(i64);
|
||||||
|
impl LabelValue for I64 {
|
||||||
|
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
|
||||||
|
v.write_int(self.0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
struct HllShardLabel {
|
||||||
if total_len == 0 {
|
hll_shard: i64,
|
||||||
return Ok(vec![]);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if desc.variable_labels.is_empty() {
|
impl LabelGroup for HllShardLabel {
|
||||||
return Ok(desc.const_label_pairs.clone());
|
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
|
||||||
}
|
const LE: &LabelName = LabelName::from_str("hll_shard");
|
||||||
|
v.write_value(LE, &I64(self.hll_shard));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut label_pairs = Vec::with_capacity(total_len);
|
self.take_sample()
|
||||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
.into_iter()
|
||||||
let mut label_pair = proto::LabelPair::default();
|
.enumerate()
|
||||||
-            label_pair.set_name(n.clone());
-            label_pair.set_value(label_values[i].to_owned());
-            label_pairs.push(label_pair);
+            .try_for_each(|(hll_shard, val)| {
+                enc.write_metric_value(
+                    name.by_ref(),
+                    labels.by_ref().compose_with(HllShardLabel {
+                        hll_shard: hll_shard as i64,
+                    }),
+                    MetricValue::Int(val as i64),
+                )
+            })
     }

-        for label_pair in &desc.const_label_pairs {
-            label_pairs.push(label_pair.clone());
-        }
-        label_pairs.sort();
-        Ok(label_pairs)
-    }
 }

 #[cfg(test)]
 mod tests {
     use std::collections::HashSet;

-    use prometheus::{proto, Opts};
+    use measured::{label::StaticLabelSet, FixedCardinalityLabel};
     use rand::{rngs::StdRng, Rng, SeedableRng};
     use rand_distr::{Distribution, Zipf};

     use crate::HyperLogLogVec;

-    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
-        let mut metrics = vec![];
-        hll.core
-            .children
-            .read()
-            .unwrap()
-            .values()
-            .for_each(|c| c.core.collect_into(&mut metrics));
-        metrics
+    #[derive(FixedCardinalityLabel, Clone, Copy)]
+    #[label(singleton = "x")]
+    enum Label {
+        A,
+        B,
     }

-    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+    fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
+        // cannot go through the `hll.collect_family_into` interface yet...
+        // need to see if I can fix the conflicting impls problem in measured.
+        (
+            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
+            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
+        )
+    }
+
+    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
         let mut buckets = [0.0; 32];
-        for metric in metrics.chunks_exact(32) {
-            if filter(&metric[0]) {
-                for (i, m) in metric.iter().enumerate() {
-                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
-                }
+        for &sample in samples {
+            for (i, m) in sample.into_iter().enumerate() {
+                buckets[i] = f64::max(buckets[i], m as f64);
             }
         }

@@ -437,7 +238,7 @@ mod tests {
     }

     fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
-        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+        let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();

         let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
         let mut set_a = HashSet::new();
@@ -445,18 +246,20 @@ mod tests {

         for x in iter.by_ref().take(n) {
             set_a.insert(x.to_bits());
-            hll.with_label_values(&["a"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::A))
+                .measure(&x.to_bits());
         }
         for x in iter.by_ref().take(n) {
             set_b.insert(x.to_bits());
-            hll.with_label_values(&["b"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::B))
+                .measure(&x.to_bits());
         }
         let merge = &set_a | &set_b;

-        let metrics = collect(&hll);
-        let len = get_cardinality(&metrics, |_| true);
-        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
-        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+        let (a, b) = collect(&hll);
+        let len = get_cardinality(&[a, b]);
+        let len_a = get_cardinality(&[a]);
+        let len_b = get_cardinality(&[b]);

         ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
     }
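
Note: the per-register max in `get_cardinality` above is the HyperLogLog union operation; the estimation step itself sits in a hunk elided here. For context, a minimal sketch of the standard raw HLL estimate over such merged registers, with assumed constants (m = 32 registers, alpha ~ 0.697) and without the small/large-range corrections a production implementation would add:

    fn hll_estimate(samples: &[[u8; 32]]) -> f64 {
        const M: f64 = 32.0; // number of registers (assumed to match the vec above)
        const ALPHA: f64 = 0.697; // standard HLL bias constant for m = 32
        // The union of HyperLogLog sketches is the register-wise max,
        // which is exactly what `get_cardinality` computes above.
        let mut registers = [0u8; 32];
        for sample in samples {
            for (r, &s) in registers.iter_mut().zip(sample) {
                *r = (*r).max(s);
            }
        }
        // Raw estimate: alpha * m^2 / sum_j 2^-M[j].
        let harmonic: f64 = registers.iter().map(|&r| 2f64.powi(-i32::from(r))).sum();
        ALPHA * M * M / harmonic
    }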
[next file; path not preserved]

@@ -4,6 +4,17 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]

+use measured::{
+    label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
+    metric::{
+        counter::CounterState,
+        gauge::GaugeState,
+        group::{Encoding, MetricValue},
+        name::{MetricName, MetricNameEncoder},
+        MetricEncoding, MetricFamilyEncoding,
+    },
+    FixedCardinalityLabel, LabelGroup, MetricGroup,
+};
 use once_cell::sync::Lazy;
 use prometheus::core::{
     Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -11,6 +22,7 @@ use prometheus::core::{
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
+use prometheus::Registry;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -23,13 +35,12 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
 pub use prometheus::{register_int_gauge, IntGauge};
 pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
 pub use prometheus::{Encoder, TextEncoder};
-use prometheus::{Registry, Result};

 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub use hll::{HyperLogLog, HyperLogLogVec};
+pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;

@@ -59,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
-pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
+pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
     INTERNAL_REGISTRY.register(c)
 }

@@ -96,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
     0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

+pub struct BuildInfo {
+    pub revision: &'static str,
+    pub build_tag: &'static str,
+}
+
+// todo: allow label group without the set
+impl LabelGroup for BuildInfo {
+    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
+        const REVISION: &LabelName = LabelName::from_str("revision");
+        v.write_value(REVISION, &self.revision);
+        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
+        v.write_value(BUILD_TAG, &self.build_tag);
+    }
+}
+
+impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut T,
+    ) -> Result<(), T::Err> {
+        enc.write_help(&name, "Build/version information")?;
+        GaugeState::write_type(&name, enc)?;
+        GaugeState {
+            count: std::sync::atomic::AtomicI64::new(1),
+        }
+        .collect_into(&(), self, name, enc)
+    }
+}
+
+#[derive(MetricGroup)]
+#[metric(new(build_info: BuildInfo))]
+pub struct NeonMetrics {
+    #[cfg(target_os = "linux")]
+    #[metric(namespace = "process")]
+    #[metric(init = measured_process::ProcessCollector::for_self())]
+    process: measured_process::ProcessCollector,

+    #[metric(namespace = "libmetrics")]
+    #[metric(init = LibMetrics::new(build_info))]
+    libmetrics: LibMetrics,
+}
+
+#[derive(MetricGroup)]
+#[metric(new(build_info: BuildInfo))]
+pub struct LibMetrics {
+    #[metric(init = build_info)]
+    build_info: BuildInfo,
+
+    #[metric(flatten)]
+    rusage: Rusage,
+
+    serve_count: CollectionCounter,
+}
+
+fn write_gauge<Enc: Encoding>(
+    x: i64,
+    labels: impl LabelGroup,
+    name: impl MetricNameEncoder,
+    enc: &mut Enc,
+) -> Result<(), Enc::Err> {
+    enc.write_metric_value(name, labels, MetricValue::Int(x))
+}
+
+#[derive(Default)]
+struct Rusage;
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+#[label(singleton = "io_operation")]
+enum IoOp {
+    Read,
+    Write,
+}
+
+impl<T: Encoding> MetricGroup<T> for Rusage
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
+        const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
+
+        let ru = get_rusage_stats();
+
+        enc.write_help(
+            DISK_IO,
+            "Bytes written and read from disk, grouped by the operation (read|write)",
+        )?;
+        GaugeState::write_type(DISK_IO, enc)?;
+        write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
+        write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
+
+        enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
+        GaugeState::write_type(MAXRSS, enc)?;
+        write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
+
+        Ok(())
+    }
+}
+
+#[derive(Default)]
+struct CollectionCounter(CounterState);
+
+impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut T,
+    ) -> Result<(), T::Err> {
+        self.0.inc();
+        enc.write_help(&name, "Number of metric requests made")?;
+        self.0.collect_into(&(), NoLabels, name, enc)
+    }
+}
+
 pub fn set_build_info_metric(revision: &str, build_tag: &str) {
     let metric = register_int_gauge_vec!(
         "libmetrics_build_info",
@@ -105,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
     .expect("Failed to register build info metric");
     metric.with_label_values(&[revision, build_tag]).set(1);
 }
+const BYTES_IN_BLOCK: i64 = 512;

 // Records I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -117,7 +250,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
 fn update_rusage_metrics() {
     let rusage_stats = get_rusage_stats();

-    const BYTES_IN_BLOCK: i64 = 512;
     DISK_IO_BYTES
         .with_label_values(&["read"])
         .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -151,6 +283,7 @@ macro_rules! register_int_counter_pair_vec {
         }
     }};
 }
+
 /// Create an [`IntCounterPair`] and registers to default registry.
 #[macro_export(local_inner_macros)]
 macro_rules! register_int_counter_pair {
@@ -188,7 +321,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
     ///
     /// An error is returned if the number of label values is not the same as the
     /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<GenericCounterPair<P>> {
         Ok(GenericCounterPair {
             inc: self.inc.get_metric_with_label_values(vals)?,
             dec: self.dec.get_metric_with_label_values(vals)?,
@@ -201,7 +337,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
         self.get_metric_with_label_values(vals).unwrap()
     }

-    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
         res[0] = self.inc.remove_label_values(vals);
         res[1] = self.dec.remove_label_values(vals);
     }
@@ -285,3 +421,171 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;

 /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
 pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
+
+pub trait CounterPairAssoc {
+    const INC_NAME: &'static MetricName;
+    const DEC_NAME: &'static MetricName;
+
+    const INC_HELP: &'static str;
+    const DEC_HELP: &'static str;
+
+    type LabelGroupSet: LabelGroupSet;
+}
+
+pub struct CounterPairVec<A: CounterPairAssoc> {
+    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Default for CounterPairVec<A>
+where
+    A::LabelGroupSet: Default,
+{
+    fn default() -> Self {
+        Self {
+            vec: Default::default(),
+        }
+    }
+}
+
+impl<A: CounterPairAssoc> CounterPairVec<A> {
+    pub fn guard(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> MeasuredCounterPairGuard<'_, A> {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+        MeasuredCounterPairGuard { vec: &self.vec, id }
+    }
+    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+    }
+    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).dec.inc();
+    }
+    pub fn remove_metric(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> Option<MeasuredCounterPairState> {
+        let id = self.vec.with_labels(labels);
+        self.vec.remove_metric(id)
+    }
+}
+
+impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
+where
+    T: ::measured::metric::group::Encoding,
+    A: CounterPairAssoc,
+    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        // write decrement first to avoid a race condition where inc - dec < 0
+        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
+        self.vec
+            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
+
+        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
+        self.vec
+            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
+
+        Ok(())
+    }
+}
+
+#[derive(MetricGroup, Default)]
+pub struct MeasuredCounterPairState {
+    pub inc: CounterState,
+    pub dec: CounterState,
+}
+
+impl measured::metric::MetricType for MeasuredCounterPairState {
+    type Metadata = ();
+}
+
+pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
+    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+    id: measured::metric::LabelId<A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
+    fn drop(&mut self) {
+        self.vec.get_metric(self.id).dec.inc();
+    }
+}
+
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
+struct Inc<T>(T);
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
+struct Dec<T>(T);
+
+impl<T: Encoding> Encoding for Inc<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Inc<T>,
+    ) -> Result<(), T::Err> {
+        self.inc.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
+
+impl<T: Encoding> Encoding for Dec<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+/// Write the dec counter to the encoder
+impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Dec<T>,
+    ) -> Result<(), T::Err> {
+        self.dec.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
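
Note: a usage sketch for the new `CounterPairVec` API above. The `Protocol` label and the metric names are illustrative assumptions, not part of this diff; `StaticLabelSet` and the derive are borrowed from measured as used in the hll test changes. The guard increments the inc counter on creation and the dec counter on drop, so `inc - dec` counts in-flight work, and `collect_group_into` writes dec before inc so a scrape never observes a negative difference:

    #[derive(FixedCardinalityLabel, Clone, Copy)]
    #[label(singleton = "protocol")]
    enum Protocol {
        Tcp,
        Ws,
    }

    struct OpenConnections;

    impl CounterPairAssoc for OpenConnections {
        const INC_NAME: &'static MetricName = MetricName::from_str("opened_connections_total");
        const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
        const INC_HELP: &'static str = "Number of connections opened";
        const DEC_HELP: &'static str = "Number of connections closed";
        type LabelGroupSet = StaticLabelSet<Protocol>;
    }

    fn serve(metrics: &CounterPairVec<OpenConnections>) {
        // inc is bumped here; dec is bumped automatically when `_guard` drops,
        // so the pair tracks the number of connections currently open.
        let _guard = metrics.guard(Protocol::Tcp);
        // ... handle the connection ...
    }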
[next file; path not preserved]

@@ -2,9 +2,9 @@ use std::str::FromStr;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix). Implemented by the server
-/// in [`attachment_service::http`]
+/// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TenantId};

 use crate::{
     models::{ShardParameters, TenantConfig},
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
     pub scheduling: Option<NodeSchedulingPolicy>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TenantPolicyRequest {
+    pub placement: Option<PlacementPolicy>,
+    pub scheduling: Option<ShardSchedulingPolicy>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
     pub shard_id: TenantShardId,
@@ -62,12 +68,27 @@ pub struct TenantLocateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
+    pub tenant_id: TenantId,
     pub shards: Vec<TenantDescribeResponseShard>,
     pub stripe_size: ShardStripeSize,
     pub policy: PlacementPolicy,
     pub config: TenantConfig,
 }

+#[derive(Serialize, Deserialize)]
+pub struct NodeDescribeResponse {
+    pub id: NodeId,
+
+    pub availability: NodeAvailabilityWrapper,
+    pub scheduling: NodeSchedulingPolicy,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
     pub tenant_shard_id: TenantShardId,
@@ -83,6 +104,8 @@ pub struct TenantDescribeResponseShard {
     pub is_pending_compute_notification: bool,
     /// A shard split is currently underway
     pub is_splitting: bool,
+
+    pub scheduling_policy: ShardSchedulingPolicy,
 }

 /// Explicitly migrating a particular shard is a low level operation
@@ -97,7 +120,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);

 impl UtilizationScore {
@@ -106,7 +129,7 @@ impl UtilizationScore {
     }
 }

-#[derive(Serialize, Clone, Copy)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
     // Normal, happy state
@@ -129,7 +152,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
     Active,
     Offline,
@@ -155,22 +178,33 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }

-impl FromStr for NodeAvailability {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            // This is used when parsing node configuration requests from neon-local.
-            // Assume the worst possible utilisation score
-            // and let it get updated via the heartbeats.
-            "active" => Ok(Self::Active(UtilizationScore::worst())),
-            "offline" => Ok(Self::Offline),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum ShardSchedulingPolicy {
+    // Normal mode: the tenant's scheduled locations may be updated at will, including
+    // for non-essential optimization.
+    Active,
+
+    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
+    // For example, this still permits a node's attachment location to change to a secondary in
+    // response to a node failure, or to assign a new secondary if a node was removed.
+    Essential,
+
+    // No scheduling: leave the shard running wherever it currently is. Even if the shard is
+    // unavailable, it will not be rescheduled to another node.
+    Pause,
+
+    // No reconciling: we will make no location_conf API calls to pageservers at all. If the
+    // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
+    Stop,
+}
+
+impl Default for ShardSchedulingPolicy {
+    fn default() -> Self {
+        Self::Active
     }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
     Active,
     Filling,
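
Note: for illustration, a request built from the new `TenantPolicyRequest` type. The exact wire encoding is an assumption based on the derives above (serde's default enum representation), not something this diff pins down:

    let req = TenantPolicyRequest {
        // Leave the placement policy unchanged; stop all scheduling and reconciling.
        placement: None,
        scheduling: Some(ShardSchedulingPolicy::Stop),
    };
    // With serde's default representation this serializes roughly as:
    //   {"placement":null,"scheduling":"Stop"}
    let body = serde_json::to_string(&req).unwrap();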
[next file; path not preserved]

@@ -1,8 +1,10 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
+use std::ops::RangeInclusive;
 use std::{fmt, ops::Range};

 use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -21,9 +23,81 @@ pub struct Key {
     pub field6: u32,
 }

+/// The storage key size.
 pub const KEY_SIZE: usize = 18;

+/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
+/// See [`Key::to_i128`] for more information on the encoding.
+pub const METADATA_KEY_SIZE: usize = 16;
+
+/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key.
+pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
+
+/// The (reserved) key prefix of relation sizes.
+pub const RELATION_SIZE_PREFIX: u8 = 0x81;
+
+/// The key prefix of AUX file keys.
+pub const AUX_KEY_PREFIX: u8 = 0x82;
+
+/// Check if the key falls in the range of metadata keys.
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
+    key[0] >= METADATA_KEY_BEGIN_PREFIX
+}
+
 impl Key {
+    /// Check if the key falls in the range of metadata keys.
+    pub const fn is_metadata_key(&self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
+        assert!(is_metadata_key_slice(key), "key not in metadata key range");
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key(key: &[u8]) -> Self {
+        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
+    }
+
+    /// Extract a metadata key to a writer. The result should always be 16 bytes.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
+    /// Get the range of metadata keys.
+    pub fn metadata_key_range() -> RangeInclusive<Self> {
+        Key {
+            field1: METADATA_KEY_BEGIN_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..=Key {
+            field1: u8::MAX,
+            field2: u16::MAX as u32,
+            field3: u32::MAX,
+            field4: u32::MAX,
+            field5: u8::MAX,
+            field6: u32::MAX,
+        }
+    }
+
     /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
     /// As long as Neon does not support tablespace (because of lack of access to local file system),
     /// we can assume that only some predefined namespace OIDs are used which can fit in u16
@@ -48,11 +122,11 @@ impl Key {
         }
     }

-    pub fn next(&self) -> Key {
+    pub const fn next(&self) -> Key {
         self.add(1)
     }

-    pub fn add(&self, x: u32) -> Key {
+    pub const fn add(&self, x: u32) -> Key {
         let mut key = *self;

         let r = key.field6.overflowing_add(x);
@@ -81,6 +155,8 @@ impl Key {
         key
     }

+    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::from_metadata_key`] instead.
     pub fn from_slice(b: &[u8]) -> Self {
         Key {
             field1: b[0],
@@ -92,6 +168,8 @@ impl Key {
         }
     }

+    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
     pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
         buf[0] = self.field1;
         BE::write_u32(&mut buf[1..5], self.field2);
@@ -475,12 +553,14 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
+
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
+    !NON_INHERITED_RANGE.contains(&key)
 }

 #[inline(always)]
@@ -556,11 +636,14 @@ impl std::str::FromStr for Key {
 mod tests {
     use std::str::FromStr;

+    use crate::key::is_metadata_key_slice;
     use crate::key::Key;

     use rand::Rng;
     use rand::SeedableRng;

+    use super::AUX_KEY_PREFIX;
+
     #[test]
     fn display_fromstr_bijection() {
         let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -576,4 +659,16 @@ mod tests {

         assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
     }
+
+    #[test]
+    fn test_metadata_keys() {
+        let mut metadata_key = vec![AUX_KEY_PREFIX];
+        metadata_key.extend_from_slice(&[0xFF; 15]);
+        let encoded_key = Key::from_metadata_key(&metadata_key);
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
+        assert_eq!(metadata_key, output_key);
+        assert!(encoded_key.is_metadata_key());
+        assert!(is_metadata_key_slice(&metadata_key));
+    }
 }
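
Note: the new metadata-key encoding above packs a `Key` into 16 bytes as u8 | u16 | u32 | u32 | u8 | u32, big-endian, truncating field2 to 16 bits. A sketch of the roundtrip using only APIs from this diff (field values illustrative):

    let key = Key {
        field1: AUX_KEY_PREFIX, // 0x82, inside the >= 0x80 metadata range
        field2: 0x1234,         // must fit in u16 for a metadata key
        field3: 0xAABBCCDD,
        field4: 0x00112233,
        field5: 0x7F,
        field6: 0xDEADBEEF,
    };
    let mut buf = Vec::new();
    key.extract_metadata_key_to_writer(&mut buf); // 1 + 2 + 4 + 4 + 1 + 4 bytes
    assert_eq!(buf.len(), METADATA_KEY_SIZE);     // = 16
    assert_eq!(Key::from_metadata_key(&buf), key); // lossless roundtrip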
[next file; path not preserved]

@@ -94,12 +94,13 @@ impl KeySpace {

     /// Remove all keys in `other` from `self`.
     /// This can involve splitting or removing of existing ranges.
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+    /// Returns the removed keyspace
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
         let (self_start, self_end) = match (self.start(), self.end()) {
             (Some(start), Some(end)) => (start, end),
             _ => {
                 // self is empty
-                return;
+                return KeySpace::default();
             }
         };

@@ -112,30 +113,37 @@ impl KeySpace {
             .skip_while(|range| self_start >= range.end)
             .take_while(|range| self_end > range.start);

+        let mut removed_accum = KeySpaceRandomAccum::new();
         for range in other_ranges {
             while let Some(overlap_at) = self.overlaps_at(range) {
                 let overlapped = self.ranges[overlap_at].clone();

                 if overlapped.start < range.start && overlapped.end <= range.end {
                     // Higher part of the range is completely overlapped.
+                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                     self.ranges[overlap_at].end = range.start;
                 }
                 if overlapped.start >= range.start && overlapped.end > range.end {
                     // Lower part of the range is completely overlapped.
+                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                     self.ranges[overlap_at].start = range.end;
                 }
                 if overlapped.start < range.start && overlapped.end > range.end {
                     // Middle part of the range is overlapped.
+                    removed_accum.add_range(range.clone());
                     self.ranges[overlap_at].end = range.start;
                     self.ranges
                         .insert(overlap_at + 1, range.end..overlapped.end);
                 }
                 if overlapped.start >= range.start && overlapped.end <= range.end {
                     // Whole range is overlapped
+                    removed_accum.add_range(self.ranges[overlap_at].clone());
                     self.ranges.remove(overlap_at);
                 }
             }
         }
+
+        removed_accum.to_keyspace()
     }

     pub fn start(&self) -> Option<Key> {
@@ -553,7 +561,16 @@ mod tests {
             Key::from_i128(11)..Key::from_i128(13),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(2)..Key::from_i128(3),
+            Key::from_i128(6)..Key::from_i128(7),
+            Key::from_i128(11)..Key::from_i128(12),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -583,7 +600,17 @@ mod tests {
             Key::from_i128(14)..Key::from_i128(17),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(3)..Key::from_i128(5),
+            Key::from_i128(8)..Key::from_i128(10),
+            Key::from_i128(14)..Key::from_i128(15),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -610,7 +637,11 @@ mod tests {
             Key::from_i128(15)..Key::from_i128(17),
         ],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace::default();
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
@@ -637,7 +668,17 @@ mod tests {
     let key_space2 = KeySpace {
         ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
     };
-    key_space1.remove_overlapping_with(&key_space2);
+
+    let removed = key_space1.remove_overlapping_with(&key_space2);
+    let removed_expected = KeySpace {
+        ranges: vec![
+            Key::from_i128(9)..Key::from_i128(10),
+            Key::from_i128(12)..Key::from_i128(15),
+            Key::from_i128(17)..Key::from_i128(19),
+        ],
+    };
+    assert_eq!(removed, removed_expected);
+
     assert_eq!(
         key_space1.ranges,
         vec![
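
Note: a sketch of the changed `remove_overlapping_with` contract; the overlap is removed from `self` and also returned, so the returned keyspace and the remainder partition the original:

    let mut ks = KeySpace {
        ranges: vec![Key::from_i128(0)..Key::from_i128(10)],
    };
    let other = KeySpace {
        ranges: vec![Key::from_i128(4)..Key::from_i128(6)],
    };
    let removed = ks.remove_overlapping_with(&other);
    // The middle of the range is split out and handed back:
    assert_eq!(removed.ranges, vec![Key::from_i128(4)..Key::from_i128(6)]);
    assert_eq!(
        ks.ranges,
        vec![
            Key::from_i128(0)..Key::from_i128(4),
            Key::from_i128(6)..Key::from_i128(10),
        ]
    );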
[next file; path not preserved]

@@ -20,6 +20,7 @@ use utils::{
     history_buffer::HistoryBufferWithDropCounter,
     id::{NodeId, TenantId, TimelineId},
     lsn::Lsn,
+    serde_system_time,
 };

 use crate::controller_api::PlacementPolicy;
@@ -301,6 +302,7 @@ pub struct TenantConfig {
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
     pub timeline_get_throttle: Option<ThrottleConfig>,
+    pub image_layer_creation_check_threshold: Option<u8>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -427,6 +429,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub tenant_id: Option<TenantShardId>,
     #[serde(flatten)]
     pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -745,10 +748,18 @@ pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalRedoManagerProcessStatus {
+    pub pid: u32,
+    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
+    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
+    pub kind: Cow<'static, str>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerStatus {
     pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
-    pub pid: Option<u32>,
+    pub process: Option<WalRedoManagerProcessStatus>,
 }

 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -757,11 +768,7 @@ pub struct WalRedoManagerStatus {
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct SecondaryProgress {
     /// The remote storage LastModified time of the heatmap object we last downloaded.
-    #[serde(
-        serialize_with = "opt_ser_rfc3339_millis",
-        deserialize_with = "opt_deser_rfc3339_millis"
-    )]
-    pub heatmap_mtime: Option<SystemTime>,
+    pub heatmap_mtime: Option<serde_system_time::SystemTime>,

     /// The number of layers currently on-disk
     pub layers_downloaded: usize,
@@ -774,29 +781,6 @@ pub struct SecondaryProgress {
     pub bytes_total: u64,
 }

-fn opt_ser_rfc3339_millis<S: serde::Serializer>(
-    ts: &Option<SystemTime>,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    match ts {
-        Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
-        None => serializer.serialize_none(),
-    }
-}
-
-fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
-    match s {
-        None => Ok(None),
-        Some(s) => humantime::parse_rfc3339(&s)
-            .map_err(serde::de::Error::custom)
-            .map(Some),
-    }
-}
-
 pub mod virtual_file {
     #[derive(
         Copy,
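
Note: a sketch of the reshaped wal-redo status payload; the pid and kind values are illustrative, and the rendered JSON is an assumption from the serde derives above:

    let status = WalRedoManagerStatus {
        last_redo_at: None,
        process: Some(WalRedoManagerProcessStatus {
            pid: 42,
            kind: std::borrow::Cow::Borrowed("example-kind"), // illustrative value
        }),
    };
    // serde_json would render this roughly as:
    //   {"last_redo_at":null,"process":{"pid":42,"kind":"example-kind"}}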
[next file; path not preserved]

@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use utils::serde_system_time::SystemTime;

 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -21,28 +21,9 @@ pub struct PageserverUtilization {
     /// When was this snapshot captured, pageserver local time.
     ///
     /// Use millis to give confidence that the value is regenerated often enough.
-    #[serde(
-        serialize_with = "ser_rfc3339_millis",
-        deserialize_with = "deser_rfc3339_millis"
-    )]
     pub captured_at: SystemTime,
 }

-fn ser_rfc3339_millis<S: serde::Serializer>(
-    ts: &SystemTime,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
-}
-
-fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
-    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
-}
-
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -69,7 +50,9 @@ mod tests {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
             utilization_score: u64::MAX,
-            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+            captured_at: SystemTime(
+                std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+            ),
         };

         let s = serde_json::to_string(&doc).unwrap();
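
Note: both files above replace their hand-rolled `*_rfc3339_millis` serializers with `utils::serde_system_time::SystemTime`, which is defined outside this diff. Judging from the deleted helpers and the test's `SystemTime(...)` constructor, it is presumably a newtype along these lines (a sketch, not the actual utils implementation):

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub struct SystemTime(pub std::time::SystemTime);

    impl serde::Serialize for SystemTime {
        fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            // Same formatting the deleted helpers used: RFC 3339 with millis.
            serializer.collect_str(&humantime::format_rfc3339_millis(self.0))
        }
    }

    impl<'de> serde::Deserialize<'de> for SystemTime {
        fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
            let s: String = serde::Deserialize::deserialize(deserializer)?;
            humantime::parse_rfc3339(&s)
                .map(SystemTime)
                .map_err(serde::de::Error::custom)
        }
    }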
|
|||||||
@@ -5,15 +5,93 @@ use crate::{
|
|||||||
models::ShardParameters,
|
models::ShardParameters,
|
||||||
};
|
};
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
|
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
|
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||||
|
///
|
||||||
|
/// This module contains a variety of types used to represent the concept of sharding
|
||||||
|
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||||
|
/// we provide an summary here.
|
||||||
|
///
|
||||||
|
/// Types used to describe shards:
|
||||||
|
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||||
|
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||||
|
/// a shard suffix.
|
||||||
|
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||||
|
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||||
|
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||||
|
/// tenant, such as layer files.
|
||||||
|
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||||
|
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||||
|
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||||
|
/// four hex digits. An unsharded tenant is `0000`.
|
||||||
|
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||||
|
///
|
||||||
|
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||||
|
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||||
|
/// multiple shards. Its value is given in 8kiB pages.
|
||||||
|
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||||
|
/// always zero: this is provided for future upgrades that might introduce different
|
||||||
|
/// data distribution schemes.
|
||||||
|
///
|
||||||
|
/// Examples:
|
||||||
|
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||||
|
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||||
|
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||||
|
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardNumber(pub u8);
|
pub struct ShardNumber(pub u8);
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(u8);
|
pub struct ShardCount(u8);
|
||||||
|
|
||||||
|
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||||
|
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||||
|
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||||
|
/// the fully qualified TenantShardId.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct ShardIndex {
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||||
|
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||||
|
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||||
|
pub struct ShardIdentity {
|
||||||
|
pub number: ShardNumber,
|
||||||
|
pub count: ShardCount,
|
||||||
|
pub stripe_size: ShardStripeSize,
|
||||||
|
layout: ShardLayout,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||||
|
struct ShardSlug<'a>(&'a TenantShardId);
|
||||||
|
|
||||||
|
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||||
|
///
|
||||||
|
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||||
|
/// # The second shard in a two-shard tenant
|
||||||
|
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||||
|
///
|
||||||
|
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||||
|
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||||
|
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||||
|
///
|
||||||
|
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||||
|
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||||
|
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||||
|
/// as a TenantId.
|
||||||
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
|
pub struct TenantShardId {
|
||||||
|
pub tenant_id: TenantId,
|
||||||
|
pub shard_number: ShardNumber,
|
||||||
|
pub shard_count: ShardCount,
|
||||||
|
}
|
||||||
|
|
||||||
impl ShardCount {
|
impl ShardCount {
|
||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
|
|
||||||
@@ -38,6 +116,7 @@ impl ShardCount {
|
|||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///
|
||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.0 == 0
|
self.0 == 0
|
||||||
}
|
}
|
||||||
@@ -53,33 +132,6 @@ impl ShardNumber {
|
|||||||
pub const MAX: Self = Self(u8::MAX);
|
pub const MAX: Self = Self(u8::MAX);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// TenantShardId identify the units of work for the Pageserver.
|
|
||||||
///
|
|
||||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
|
||||||
///
|
|
||||||
/// # The second shard in a two-shard tenant
|
|
||||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
|
||||||
///
|
|
||||||
/// Historically, tenants could not have multiple shards, and were identified
|
|
||||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
|
||||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
|
||||||
/// tenant which should be written as a TenantId with no suffix.
|
|
||||||
///
|
|
||||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
|
||||||
/// is both forward and backward compatible: a legacy TenantId can be
|
|
||||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
|
||||||
/// as a TenantId.
|
|
||||||
///
|
|
||||||
/// Note that the binary encoding is _not_ backward compatible, because
|
|
||||||
/// at the time sharding is introduced, there are no existing binary structures
|
|
||||||
/// containing TenantId that we need to handle.
|
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
|
||||||
pub struct TenantShardId {
|
|
||||||
pub tenant_id: TenantId,
|
|
||||||
pub shard_number: ShardNumber,
|
|
||||||
pub shard_count: ShardCount,
|
|
||||||
}
|
|
||||||
|
|
||||||
 impl TenantShardId {
     pub fn unsharded(tenant_id: TenantId) -> Self {
         Self {
@@ -111,10 +163,13 @@ impl TenantShardId {
     }
 
     /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
         self.shard_number == ShardNumber(0)
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
     }
@@ -150,9 +205,6 @@ impl TenantShardId {
     }
 }
 
-/// Formatting helper
-struct ShardSlug<'a>(&'a TenantShardId);
-
 impl<'a> std::fmt::Display for ShardSlug<'a> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
@@ -222,16 +274,6 @@ impl From<[u8; 18]> for TenantShardId {
     }
 }
 
-/// For use within the context of a particular tenant, when we need to know which
-/// shard we're dealing with, but do not need to know the full ShardIdentity (because
-/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
-/// TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
 impl ShardIndex {
     pub fn new(number: ShardNumber, count: ShardCount) -> Self {
         Self {
@@ -246,6 +288,9 @@ impl ShardIndex {
         }
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
     }
@@ -313,6 +358,8 @@ impl Serialize for TenantShardId {
         if serializer.is_human_readable() {
             serializer.collect_str(self)
         } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
             let mut packed: [u8; 18] = [0; 18];
             packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
             packed[16] = self.shard_number.0;
@@ -390,16 +437,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 
-/// The ShardIdentity contains the information needed for one member of map
-/// to resolve a key to a shard, and then check whether that shard is ==self.
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
-pub struct ShardIdentity {
-    pub number: ShardNumber,
-    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
-    layout: ShardLayout,
-}
-
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
     #[error("Invalid shard count")]
@@ -439,6 +476,9 @@ impl ShardIdentity {
         }
     }
 
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
     pub fn is_unsharded(&self) -> bool {
         self.number == ShardNumber(0) && self.count == ShardCount(0)
     }
@@ -487,6 +527,8 @@ impl ShardIdentity {
     }
 
     /// Return true if the key should be ingested by this shard
+    ///
+    /// Shards must ingest _at least_ keys which return true from this check.
     pub fn is_key_local(&self, key: &Key) -> bool {
         assert!(!self.is_broken());
         if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -496,8 +538,28 @@ impl ShardIdentity {
         }
     }
 
+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
+    /// as a symptom of that issue.
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7454: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
     /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
+    /// data store, e.g. during compaction after a split.
+    ///
+    /// Shards _may_ drop keys which return false here, but are not obliged to.
     pub fn is_key_disposable(&self, key: &Key) -> bool {
         if key_is_shard0(key) {
             // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -523,7 +585,7 @@ impl ShardIdentity {
 
     /// Convenience for checking if this identity is the 0th shard in a tenant,
     /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
         self.number == ShardNumber(0)
     }
 }
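The new `is_key_buggy_forknum` reuses the crate's key-to-shard mapping. The sketch below reimplements that mapping locally for illustration only; `murmurhash32` and `hash_combine` are assumed to mirror postgres's `hashfn.h`, as the context line closing this file's diff suggests.

```rust
// Sketch of the key->shard mapping: hash the relation id and the stripe
// number, combine them, then take the result modulo the shard count.
fn murmurhash32(mut h: u32) -> u32 {
    // murmur3 32-bit finalizer, as in postgres's hashfn.h
    h ^= h >> 16;
    h = h.wrapping_mul(0x85eb_ca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2_ae35);
    h ^= h >> 16;
    h
}

fn hash_combine(mut a: u32, b: u32) -> u32 {
    // boost-style 32-bit hash_combine
    a ^= b
        .wrapping_add(0x9e37_79b9)
        .wrapping_add(a << 6)
        .wrapping_add(a >> 2);
    a
}

fn shard_of_block(rel: u32, block: u32, stripe_size: u32, shard_count: u8) -> u8 {
    let mut hash = murmurhash32(rel);
    hash = hash_combine(hash, murmurhash32(block / stripe_size));
    (hash % shard_count as u32) as u8
}

fn main() {
    // Same key always maps to the same shard.
    assert_eq!(
        shard_of_block(16384, 1000, 32768, 8),
        shard_of_block(16384, 1000, 32768, 8)
    );
}
```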
@@ -606,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
     // relation pages are distributed to shards other than shard zero. Everything else gets
     // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
     // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
+    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
+    // because they must be included in basebackups.
+    let is_initfork = key.field5 == INIT_FORKNUM;
+
+    !is_rel_block_key(key) || is_initfork
 }
 
 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 
 pub use v14::bindings::{CheckPoint, ControlFileData};
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
         intermediate_lsns.insert(0, initial_lsn);
     }
 
-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
     //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
+    // returns the position just after the page header on the next page. That's where the next
+    // record will be inserted. But the page header hasn't actually been written to the WAL
+    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
+    // error. Because of that, if the insert location is just after a page header, back off to
+    // previous page boundary.
+    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
+    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
+    }
+    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
     Ok(intermediate_lsns)
 }
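The back-off rule added above is easier to follow in isolation. A standalone sketch follows, where the 40-byte long-header and 24-byte short-header sizes are assumptions matching the conventional values of the imported constants:

```rust
// Sketch: if the insert LSN sits immediately after a page header, that
// header has not been written yet, so flushing to it would fail; back
// off to the page boundary instead.
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
const XLOG_BLCKSZ: u64 = 8192;
const XLOG_SIZE_OF_XLOG_LONG_PHD: u64 = 40;
const XLOG_SIZE_OF_XLOG_SHORT_PHD: u64 = 24;

fn flushable_lsn(mut lsn: u64) -> u64 {
    if lsn % WAL_SEGMENT_SIZE == XLOG_SIZE_OF_XLOG_LONG_PHD {
        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD;
    } else if lsn % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD {
        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD;
    }
    lsn
}

fn main() {
    // Just past the short header on the second page of a segment:
    assert_eq!(flushable_lsn(8192 + 24), 8192);
    // Anywhere else is flushable as-is.
    assert_eq!(flushable_lsn(8192 + 100), 8292);
}
```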
@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
 
         client.execute("CREATE table t(x int)", &[])?;
 
-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
-        // We will use logical message as the padding. We start with detecting how much WAL
-        // it takes for one logical message, considering all alignments and headers.
-        let base_wal_advance = {
-            let before_lsn = client.pg_current_wal_insert_lsn()?;
-            // Small non-empty message bigger than few bytes is more likely than an empty
-            // message to have the same format as the big padding message.
-            client.execute(
-                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
-                &[],
-            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
-                + XLOG_SIZE_OF_XLOG_RECORD
-        };
-        let mut remaining_lsn =
-            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
-        if remaining_lsn < base_wal_advance {
-            remaining_lsn += XLOG_BLCKSZ;
-        }
-        let repeats = 10 + remaining_lsn - base_wal_advance;
-        info!(
-            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
-            client.pg_current_wal_insert_lsn()?,
-            remaining_lsn,
-            base_wal_advance,
-            repeats
-        );
-        client.execute(
-            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-            &[&(repeats as i32)],
-        )?;
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We
+        // will use carefully-sized logical messages to advance WAL insert location such
+        // that there is just enough space on the page for the XLOG_SWITCH record.
+        loop {
+            // We start with measuring how much WAL it takes for one logical message,
+            // considering all alignments and headers.
+            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
+                &[],
+            )?;
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
+
+            // Did the record cross a page boundary? If it did, start over. Crossing a
+            // page boundary adds to the apparent size of the record because of the page
+            // header, which throws off the calculation.
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
+            {
+                continue;
+            }
+            // base_size is the size of a logical message without the payload
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
+
+            // Is there enough space on the page for another logical message and an
+            // XLOG_SWITCH? If not, start over.
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
+            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+                continue;
+            }
+
+            // We will write another logical message, such that after the logical message
+            // record, there will be space for exactly one XLOG_SWITCH. How large should
+            // the logical message's payload be? An XLOG_SWITCH record has no data => its
+            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
+
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+                &[&(repeats as i32)],
+            )?;
+            break;
+        }
         info!(
             "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
             client.pg_current_wal_insert_lsn()?,
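For concreteness, here is the payload sizing from the new loop with made-up numbers; the 24-byte XLOG_SWITCH size is an assumption standing in for `XLOG_SIZE_OF_XLOG_RECORD`:

```rust
// Sketch of the padding computation: page_remain bytes are left on the
// page, a logical message costs base_size bytes plus its payload, and
// the XLOG_SWITCH record must fit exactly in what remains.
fn padding_payload(page_remain: u64, base_size: u64, xlog_switch_size: u64) -> u64 {
    page_remain - base_size - xlog_switch_size
}

fn main() {
    // 200 bytes left, 54 bytes of message overhead, 24-byte switch record:
    assert_eq!(padding_payload(200, 54, 24), 122);
}
```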
@@ -134,6 +134,11 @@ impl RemotePath {
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
+
+    pub fn add_trailing_slash(&self) -> Self {
+        // Unwrap safety: inputs are guaranteed to be valid UTF-8
+        Self(format!("{}/", self.0).try_into().unwrap())
+    }
 }
 
 /// We don't need callers to be able to pass arbitrary delimiters: just control
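Hypothetical usage of the new helper, assuming `RemotePath::from_string` and `get_path` are public as used elsewhere in this diff:

```rust
use remote_storage::RemotePath;

// "foo/bar" -> "foo/bar/": callers use this to make a prefix behave like a
// directory in delimiter-mode listings.
fn demo() {
    let p = RemotePath::from_string("foo/bar").unwrap();
    assert_eq!(p.add_trailing_slash().get_path().as_str(), "foo/bar/");
}
```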
@@ -157,47 +162,21 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// so this method doesnt need to.
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None, cancel)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
-    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
-            .await?
-            .keys;
-        Ok(result)
-    }
-
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    ///
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// from the absolute root of the bucket.
+    ///
+    /// `mode` configures whether to use a delimiter. Without a delimiter, all keys
+    /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
+    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
+    /// returned in `keys`.
+    ///
+    /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
+    /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
+    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
+    ///
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
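The delimiter semantics in the new doc comment can be paraphrased with plain strings. This is an illustrative sketch only, not the trait's implementation:

```rust
use std::collections::HashSet;

// Given keys relative to the listed prefix, a delimiter listing returns
// bare keys in `keys` and the first path component of nested keys in
// `prefixes`, each prefix at most once.
fn delimiter_listing<'a>(relative_keys: &[&'a str]) -> (Vec<&'a str>, HashSet<&'a str>) {
    let mut keys = Vec::new();
    let mut prefixes = HashSet::new();
    for key in relative_keys {
        match key.split_once('/') {
            Some((first, _rest)) => {
                prefixes.insert(first);
            }
            None => keys.push(*key),
        }
    }
    (keys, prefixes)
}

fn main() {
    let (keys, prefixes) = delimiter_listing(&["uncle", "parent/child", "parent/child2"]);
    assert_eq!(keys, vec!["uncle"]);
    assert_eq!(prefixes, HashSet::from(["parent"]));
}
```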
@@ -336,41 +315,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     }
 }
 
-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
-    pub async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
-        }
-    }
-
     /// See [`RemoteStorage::upload`]
     pub async fn upload(
         &self,
@@ -565,6 +509,16 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);
 
+impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
+    fn from(arr: [(&str, &str); N]) -> Self {
+        let map: HashMap<String, String> = arr
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        Self(map)
+    }
+}
+
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
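Hypothetical usage of the new `From` impl, assuming `StorageMetadata` is exported at the crate root:

```rust
use remote_storage::StorageMetadata;

// Build metadata from an array literal instead of assembling a HashMap by hand.
fn layer_metadata() -> StorageMetadata {
    StorageMetadata::from([("content-type", "application/octet-stream"), ("kind", "layer")])
}
```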
@@ -5,11 +5,9 @@
 //! volume is mounted to the local FS.
 
 use std::{
-    borrow::Cow,
-    future::Future,
+    collections::HashSet,
     io::ErrorKind,
     num::NonZeroU32,
-    pin::Pin,
     time::{Duration, SystemTime, UNIX_EPOCH},
 };
 
@@ -22,11 +20,11 @@ use tokio::{
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use tracing::*;
-use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
+use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{
     Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 use super::{RemoteStorage, StorageMetadata};
@@ -93,7 +91,47 @@ impl LocalFs {
 
     #[cfg(test)]
     async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        Ok(get_all_files(&self.storage_root, true)
+        use std::{future::Future, pin::Pin};
+        fn get_all_files<'a, P>(
+            directory_path: P,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+        where
+            P: AsRef<Utf8Path> + Send + Sync + 'a,
+        {
+            Box::pin(async move {
+                let directory_path = directory_path.as_ref();
+                if directory_path.exists() {
+                    if directory_path.is_dir() {
+                        let mut paths = Vec::new();
+                        let mut dir_contents = fs::read_dir(directory_path).await?;
+                        while let Some(dir_entry) = dir_contents.next_entry().await? {
+                            let file_type = dir_entry.file_type().await?;
+                            let entry_path =
+                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                                    anyhow::Error::msg(format!(
+                                        "non-Unicode path: {}",
+                                        pb.to_string_lossy()
+                                    ))
+                                })?;
+                            if file_type.is_symlink() {
+                                tracing::debug!("{entry_path:?} is a symlink, skipping")
+                            } else if file_type.is_dir() {
+                                paths.extend(get_all_files(&entry_path).await?.into_iter())
+                            } else {
+                                paths.push(entry_path);
+                            }
+                        }
+                        Ok(paths)
+                    } else {
+                        bail!("Path {directory_path:?} is not a directory")
+                    }
+                } else {
+                    Ok(Vec::new())
+                }
+            })
+        }
+
+        Ok(get_all_files(&self.storage_root)
             .await?
             .into_iter()
             .map(|path| {
@@ -120,6 +158,14 @@ impl LocalFs {
         // S3 object list prefixes can be arbitrary strings, but when reading
         // the local filesystem we need a directory to start calling read_dir on.
         let mut initial_dir = full_path.clone();
+
+        // If there's no trailing slash, we have to start looking from one above: even if
+        // `initial_dir` is a directory, we should still list any prefixes in the parent
+        // that start with the same string.
+        if !full_path.to_string().ends_with('/') {
+            initial_dir.pop();
+        }
+
         loop {
             // Did we make it to the root?
             if initial_dir.parent().is_none() {
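A standalone sketch of the new rule using std paths instead of the crate's `Utf8PathBuf` (illustration only):

```rust
use std::path::PathBuf;

// For "timelines/tl/grandp" (no trailing slash) we must scan
// "timelines/tl", because sibling entries like "grandparent" share the
// "grandp" stem and still belong in the listing.
fn listing_start_dir(full_path: &str) -> PathBuf {
    let mut dir = PathBuf::from(full_path);
    if !full_path.ends_with('/') {
        dir.pop();
    }
    dir
}

fn main() {
    assert_eq!(
        listing_start_dir("timelines/tl/grandp"),
        PathBuf::from("timelines/tl")
    );
    assert_eq!(listing_start_dir("timelines/tl/"), PathBuf::from("timelines/tl"));
}
```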
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
         let op = async {
             let mut result = Listing::default();
 
-            if let ListingMode::NoDelimiter = mode {
-                let keys = self
-                    .list_recursive(prefix)
-                    .await
-                    .map_err(DownloadError::Other)?;
-
-                result.keys = keys
-                    .into_iter()
-                    .filter(|k| {
-                        let path = k.with_base(&self.storage_root);
-                        !path.is_dir()
-                    })
-                    .collect();
-
-                if let Some(max_keys) = max_keys {
-                    result.keys.truncate(max_keys.get() as usize);
-                }
-
-                return Ok(result);
-            }
-
-            let path = match prefix {
-                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-                None => Cow::Borrowed(&self.storage_root),
-            };
-
-            let prefixes_to_filter = get_all_files(path.as_ref(), false)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            // filter out empty directories to mirror s3 behavior.
-            for prefix in prefixes_to_filter {
-                if prefix.is_dir()
-                    && is_directory_empty(&prefix)
-                        .await
-                        .map_err(DownloadError::Other)?
-                {
-                    continue;
-                }
-
-                let stripped = prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    );
-
-                if prefix.is_dir() {
-                    result.prefixes.push(stripped);
-                } else {
-                    result.keys.push(stripped);
-                }
-            }
-
+            // Filter out directories: in S3 directories don't exist, only the keys within them do.
+            let keys = self
+                .list_recursive(prefix)
+                .await
+                .map_err(DownloadError::Other)?;
+            let keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();
+
+            if let ListingMode::NoDelimiter = mode {
+                result.keys = keys;
+            } else {
+                let mut prefixes = HashSet::new();
+                for key in keys {
+                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
+                    let relative_key = if let Some(prefix) = prefix {
+                        let mut prefix = prefix.clone();
+                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
+                        // end up with full file/dir names.
+                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
+                        let has_slash = prefix.0.to_string().ends_with('/');
+                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
+                            prefix
+                        } else {
+                            prefix.0.pop();
+                            prefix
+                        };
+
+                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
+                    } else {
+                        key
+                    };
+
+                    let relative_key = format!("{}", relative_key);
+                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                        let first_part = relative_key
+                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                            .next()
+                            .unwrap()
+                            .to_owned();
+                        prefixes.insert(first_part);
+                    } else {
+                        result
+                            .keys
+                            .push(RemotePath::from_string(&relative_key).unwrap());
+                    }
+                }
+                result.prefixes = prefixes
+                    .into_iter()
+                    .map(|s| RemotePath::from_string(&s).unwrap())
+                    .collect();
+            }
+
+            if let Some(max_keys) = max_keys {
+                result.keys.truncate(max_keys.get() as usize);
+            }
             Ok(result)
         };
@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
     path_with_suffix_extension(original_path, "metadata")
 }
 
-fn get_all_files<'a, P>(
-    directory_path: P,
-    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Utf8Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if directory_path.exists() {
-            if directory_path.is_dir() {
-                let mut paths = Vec::new();
-                let mut dir_contents = fs::read_dir(directory_path).await?;
-                while let Some(dir_entry) = dir_contents.next_entry().await? {
-                    let file_type = dir_entry.file_type().await?;
-                    let entry_path =
-                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                            anyhow::Error::msg(format!(
-                                "non-Unicode path: {}",
-                                pb.to_string_lossy()
-                            ))
-                        })?;
-                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} is a symlink, skipping")
-                    } else if file_type.is_dir() {
-                        if recursive {
-                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
-                        } else {
-                            paths.push(entry_path)
-                        }
-                    } else {
-                        paths.push(entry_path);
-                    }
-                }
-                Ok(paths)
-            } else {
-                bail!("Path {directory_path:?} is not a directory")
-            }
-        } else {
-            Ok(Vec::new())
-        }
-    })
-}
-
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
     let target_dir = match target_file_path.parent() {
         Some(parent_dir) => parent_dir,
@@ -923,13 +930,18 @@ mod fs_tests {
         // No delimiter: should recursively list everything
         let (storage, cancel) = create_storage()?;
         let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let child_sibling =
+            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
         let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
 
         let listing = storage
             .list(None, ListingMode::NoDelimiter, None, &cancel)
             .await?;
         assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
+            listing.keys.into_iter().collect::<HashSet<_>>(),
+            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
+        );
 
         // Delimiter: should only go one deep
         let listing = storage
@@ -942,7 +954,25 @@ mod fs_tests {
         );
         assert!(listing.keys.is_empty());
 
-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(
+            listing.keys,
+            [RemotePath::from_string("uncle").unwrap()].to_vec()
+        );
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("parent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix without a trailing slash
         let listing = storage
             .list(
                 Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -951,12 +981,66 @@ mod fs_tests {
                 &cancel,
             )
             .await?;
+        assert_eq!(listing.keys, [].to_vec());
         assert_eq!(
             listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
         );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn list_part_component() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let (storage, cancel) = create_storage()?;
+
+        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
+        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
+        // a freeform prefix.
+        let _child_a =
+            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
+        let _child_b =
+            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(
+                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
+                ),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+
+        let mut found_prefixes = listing.prefixes.clone();
+        found_prefixes.sort();
+        assert_eq!(
+            found_prefixes,
+            [
+                RemotePath::from_string("tenant").unwrap(),
+                RemotePath::from_string("tenant-01").unwrap(),
+            ]
+            .to_vec()
+        );
 
         Ok(())
     }
@@ -178,10 +178,7 @@ impl S3Bucket {
 
     pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
         assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
         match &self.prefix_in_bucket {
             Some(prefix) => prefix.clone() + "/" + path_string,
             None => path_string.to_string(),
@@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket {
         // get the passed prefix or if it is not set use prefix_in_bucket value
         let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
-            });
+            .or_else(|| {
+                self.prefix_in_bucket.clone().map(|mut s| {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    s
+                })
+            });
 
         let _permit = self.permit(kind, cancel).await?;
@@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket {
                 }
             }
 
-            result.prefixes.extend(
-                prefixes
-                    .iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
-            );
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));
 
             continuation_token = match response.next_continuation_token {
                 Some(new_token) => Some(new_token),
@@ -1050,22 +1046,22 @@ mod tests {
             Some("/test/prefix/"),
         ];
         let expected_outputs = [
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
+            vec!["", "some/path", "some/path/"],
+            vec!["/", "/some/path", "/some/path/"],
             vec![
                 "test/prefix/",
                 "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
             ],
             vec![
                 "test/prefix/",
                 "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
        ];
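The updated vectors capture one behavioral change: a trailing separator on an input path is now preserved rather than trimmed. A simplified sketch of the join rule under test (hypothetical helper, not the real method):

```rust
// Keys are joined under the optional bucket prefix verbatim, so a
// trailing '/' on the input survives into the object key.
fn to_s3_object_key(prefix_in_bucket: Option<&str>, path: &str) -> String {
    match prefix_in_bucket {
        Some(prefix) => format!("{prefix}/{path}"),
        None => path.to_string(),
    }
}

fn main() {
    assert_eq!(
        to_s3_object_key(Some("test/prefix"), "some/path/"),
        "test/prefix/some/path/"
    );
}
```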
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;
 
 impl RemoteStorage for UnreliableWrapper {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix, cancel).await
-    }
-
-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys, cancel).await
-    }
-
     async fn list(
         &self,
         prefix: Option<&RemotePath>,
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
+use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
     let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
         .context("common_prefix construction")?;
     let root_remote_prefixes = test_client
-        .list_prefixes(None, &cancel)
-        .await
-        .context("client list root prefixes failure")?
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes
         .into_iter()
         .collect::<HashSet<_>>();
     assert_eq!(
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
     );
 
     let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix), &cancel)
-        .await
-        .context("client list nested prefixes failure")?
+        .list(
+            Some(&base_prefix.add_trailing_slash()),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .prefixes
         .into_iter()
         .collect::<HashSet<_>>();
     let remote_only_prefixes = nested_remote_prefixes
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+async fn list_no_delimiter_works(
+    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
     let ctx = match ctx {
         MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
         MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
     let base_prefix =
         RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
     let root_files = test_client
-        .list_files(None, None, &cancel)
+        .list(None, ListingMode::NoDelimiter, None, &cancel)
         .await
         .context("client list root files failure")?
+        .keys
         .into_iter()
         .collect::<HashSet<_>>();
     assert_eq!(
         root_files,
         ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
+        "remote storage list on root mismatches with the uploads."
     );
 
     // Test that max_keys limit works. In total there are about 21 files (see
     // upload_simple_remote_data call in test_real_s3.rs).
     let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
+        .list(
+            None,
+            ListingMode::NoDelimiter,
+            Some(NonZeroU32::new(2).unwrap()),
+            &cancel,
+        )
         .await
         .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
+    assert_eq!(limited_root_files.keys.len(), 2);
 
     let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None, &cancel)
+        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
     let trim_remote_blobs: HashSet<_> = ctx
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
         .collect();
     assert_eq!(
         nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
+        "remote storage list on subdirrectory mismatches with the uploads."
     );
     Ok(())
 }
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
 
     ctx.client.delete_objects(&[path1, path2], &cancel).await?;
 
-    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
+    let prefixes = ctx
+        .client
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes;
 
     assert_eq!(prefixes.len(), 1);
@@ -57,7 +57,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -86,7 +85,6 @@ struct AzureWithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -134,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     }
 }
 
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
     Enabled(AzureWithSimpleTestBlobs),
     Disabled,
@@ -148,7 +142,6 @@ struct AzureWithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         client: &Arc<GenericRemoteStorage>,
         cancel: &CancellationToken,
     ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None, cancel))
-            .await
-            .context("list root files failure")?
-            .into_iter()
-            .collect::<HashSet<_>>())
+        Ok(
+            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
+                .await
+                .context("list root files failure")?
+                .keys
+                .into_iter()
+                .collect::<HashSet<_>>(),
+        )
     }
 
     let cancel = CancellationToken::new();
@@ -219,7 +222,6 @@ enum MaybeEnabledStorage {
     Disabled,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -248,7 +250,6 @@ struct S3WithTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -296,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
     }
 }
 
-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
     Enabled(S3WithSimpleTestBlobs),
     Disabled,
@@ -310,7 +307,6 @@ struct S3WithSimpleTestBlobs {
     remote_blobs: HashSet<RemotePath>,
 }
 
-#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
     async fn setup() -> Self {
         ensure_logging_ready();
@@ -22,6 +22,7 @@ camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
+humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
21 libs/utils/src/env.rs Normal file
@@ -0,0 +1,21 @@
+//! Wrapper around `std::env::var` for parsing environment variables.
+
+use std::{fmt::Display, str::FromStr};
+
+pub fn var<V, E>(varname: &str) -> Option<V>
+where
+    V: FromStr<Err = E>,
+    E: Display,
+{
+    match std::env::var(varname) {
+        Ok(s) => Some(
+            s.parse()
+                .map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
+                .unwrap(),
+        ),
+        Err(std::env::VarError::NotPresent) => None,
+        Err(std::env::VarError::NotUnicode(_)) => {
+            panic!("env var {varname} is not unicode")
+        }
+    }
+}
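A minimal usage sketch for the new helper (illustrative only; the `NEON_TEST_PORT` name and the fallback value are invented for this example):

```rust
// Hypothetical caller: read an optional port override from the environment.
// `var` returns None if the variable is unset, and panics with a descriptive
// message if it is set but fails to parse.
fn port_from_env() -> u16 {
    utils::env::var("NEON_TEST_PORT").unwrap_or(5432)
}
```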
@@ -63,6 +63,7 @@ pub mod measured_stream;
 
 pub mod serde_percent;
 pub mod serde_regex;
+pub mod serde_system_time;
 
 pub mod pageserver_feedback;
 
@@ -89,6 +90,10 @@ pub mod yielding_loop;
 
 pub mod zstd;
 
+pub mod env;
+
+pub mod poison;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
121 libs/utils/src/poison.rs Normal file
@@ -0,0 +1,121 @@
+//! Protect a piece of state from reuse after it is left in an inconsistent state.
+//!
+//! # Example
+//!
+//! ```
+//! # tokio_test::block_on(async {
+//! use utils::poison::Poison;
+//! use std::time::Duration;
+//!
+//! struct State {
+//!     clean: bool,
+//! }
+//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
+//!
+//! let mut mutex_guard = state.lock().await;
+//! let mut poison_guard = mutex_guard.check_and_arm()?;
+//! let state = poison_guard.data_mut();
+//! state.clean = false;
+//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
+//! tokio::time::sleep(Duration::from_secs(10)).await;
+//! state.clean = true;
+//! poison_guard.disarm();
+//! # Ok::<(), utils::poison::Error>(())
+//! # });
+//! ```
+
+use tracing::warn;
+
+pub struct Poison<T> {
+    what: &'static str,
+    state: State,
+    data: T,
+}
+
+#[derive(Clone, Copy)]
+enum State {
+    Clean,
+    Armed,
+    Poisoned { at: chrono::DateTime<chrono::Utc> },
+}
+
+impl<T> Poison<T> {
+    /// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
+    pub fn new(what: &'static str, data: T) -> Self {
+        Self {
+            what,
+            state: State::Clean,
+            data,
+        }
+    }
+
+    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
+    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
+        match self.state {
+            State::Clean => {
+                self.state = State::Armed;
+                Ok(Guard(self))
+            }
+            State::Armed => unreachable!("transient state"),
+            State::Poisoned { at } => Err(Error::Poisoned {
+                what: self.what,
+                at,
+            }),
+        }
+    }
+}
+
+/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
+/// Once modifications are done, use [`Self::disarm`].
+/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
+/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
+pub struct Guard<'a, T>(&'a mut Poison<T>);
+
+impl<'a, T> Guard<'a, T> {
+    pub fn data(&self) -> &T {
+        &self.0.data
+    }
+    pub fn data_mut(&mut self) -> &mut T {
+        &mut self.0.data
+    }
+
+    pub fn disarm(self) {
+        match self.0.state {
+            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
+            State::Armed => {
+                self.0.state = State::Clean;
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+impl<'a, T> Drop for Guard<'a, T> {
+    fn drop(&mut self) {
+        match self.0.state {
+            State::Clean => {
+                // set by disarm()
+            }
+            State::Armed => {
+                // still armed => poison it
+                let at = chrono::Utc::now();
+                self.0.state = State::Poisoned { at };
+                warn!(at=?at, "poisoning {}", self.0.what);
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("poisoned at {at}: {what}")]
+    Poisoned {
+        what: &'static str,
+        at: chrono::DateTime<chrono::Utc>,
+    },
+}
@@ -182,6 +182,18 @@ where
         }
     }
 
+    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
+    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
+        let internal = self.internal.lock().unwrap();
+        let cnt = internal.current.cnt_value();
+        drop(internal);
+        if cnt >= num {
+            Ok(())
+        } else {
+            Err(cnt)
+        }
+    }
+
     /// Register and return a channel that will be notified when a number arrives,
     /// or None, if it has already arrived.
     fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
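For context, a sketch of how the new non-blocking check might be used; the `log_if_behind` helper and its bounds are invented, assuming the `SeqWait`/`MonotonicCounter` types from this module and an `Lsn`-valued counter:

```rust
use utils::lsn::Lsn;
use utils::seqwait::{MonotonicCounter, SeqWait};

// Peek whether wait_for(lsn) would block, without registering a waiter.
fn log_if_behind<C: MonotonicCounter<Lsn>>(waiters: &SeqWait<C, Lsn>, lsn: Lsn) {
    match waiters.would_wait_for(lsn) {
        // `lsn` has already arrived; wait_for(lsn) would return immediately.
        Ok(()) => {}
        // wait_for(lsn) would block; Err carries the counter's current value.
        Err(current) => tracing::trace!("behind: counter at {current}, want {lsn}"),
    }
}
```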
55 libs/utils/src/serde_system_time.rs Normal file
@@ -0,0 +1,55 @@
+//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
+
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct SystemTime(
+    #[serde(
+        deserialize_with = "deser_rfc3339_millis",
+        serialize_with = "ser_rfc3339_millis"
+    )]
+    pub std::time::SystemTime,
+);
+
+fn ser_rfc3339_millis<S: serde::ser::Serializer>(
+    ts: &std::time::SystemTime,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
+}
+
+fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
+    fn to_millisecond_precision(time: SystemTime) -> SystemTime {
+        match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
+            Ok(duration) => {
+                let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
+                SystemTime(
+                    std::time::SystemTime::UNIX_EPOCH
+                        + std::time::Duration::from_millis(total_millis),
+                )
+            }
+            Err(_) => time,
+        }
+    }
+
+    #[test]
+    fn test_serialize_deserialize() {
+        let input = SystemTime(std::time::SystemTime::now());
+        let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
+        let serialized = serde_json::to_string(&input).unwrap();
+        assert_eq!(expected_serialized, serialized);
+        let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
+        assert_eq!(to_millisecond_precision(input), deserialized);
+    }
+}
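To make the wire format concrete, a small sketch assuming `serde_json` on the consumer side; the expected string follows from `format_rfc3339_millis`:

```rust
#[test]
fn wire_format_sketch() {
    let ts = utils::serde_system_time::SystemTime(std::time::SystemTime::UNIX_EPOCH);
    // Serialized as an RFC3339 JSON string with fixed millisecond precision.
    assert_eq!(
        serde_json::to_string(&ts).unwrap(),
        "\"1970-01-01T00:00:00.000Z\""
    );
}
```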
@@ -192,6 +192,14 @@ impl<T> OnceCell<T> {
         }
     }
 
+    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
+    /// initialized.
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let inner = self.inner.get_mut().unwrap();
+
+        inner.take_and_deinit()
+    }
+
     /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
     pub fn initializer_count(&self) -> usize {
         self.initializers.load(Ordering::Relaxed)
@@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> {
     /// The permit will be on a semaphore part of the new internal value, and any following
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+        self.0
+            .take_and_deinit()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> Inner<T> {
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let value = self.value.take()?;
+
         let mut swapped = Inner::default();
         let sem = swapped.init_semaphore.clone();
         // acquire and forget right away, moving the control over to InitPermit
         sem.try_acquire().expect("we just created this").forget();
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, InitPermit(sem)))
-            .expect("guard is not created unless value has been initialized")
+        let permit = InitPermit(sem);
+        std::mem::swap(self, &mut swapped);
+        Some((value, permit))
     }
 }
 
@@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> {
 /// On drop, this type will return the permit.
 pub struct InitPermit(Arc<tokio::sync::Semaphore>);
 
+impl std::fmt::Debug for InitPermit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let ptr = Arc::as_ptr(&self.0) as *const ();
+        f.debug_tuple("InitPermit").field(&ptr).finish()
+    }
+}
+
 impl Drop for InitPermit {
     fn drop(&mut self) {
         assert_eq!(
@@ -559,4 +582,22 @@ mod tests {
 
         assert_eq!(*target.get().unwrap(), 11);
     }
+
+    #[tokio::test]
+    async fn take_and_deinit_on_mut() {
+        use std::convert::Infallible;
+
+        let mut target = OnceCell::<u32>::default();
+        assert!(target.take_and_deinit().is_none());
+
+        target
+            .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
+            .await
+            .unwrap();
+
+        let again = target.take_and_deinit();
+        assert!(matches!(again, Some((42, _))), "{again:?}");
+
+        assert!(target.take_and_deinit().is_none());
+    }
 }
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
@@ -27,30 +27,50 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-03-20 on i3en.3xlarge
+//! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
-//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
-//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
-//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
-//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
-//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
-//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
-//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
-//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
-//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
-//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
-//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
-//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
-//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
-//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
-//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
+//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
+//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
+//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
+//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
+//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
+//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
+//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
+//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
+//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
+//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
+//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
+//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
+//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
+//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
+//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
+//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
+//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
+//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
+//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
+//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
+//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
+//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
+//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
+//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
+//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
+//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
+//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
+//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
+//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
+//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
+//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
+//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
 //! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{
+    config::PageServerConf,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, ProcessKind},
+};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
     sync::Arc,
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
-
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
-        }
-    }
+    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-short"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::short_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
+        }
+
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::medium_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
+        }
+    }
 }
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl(
+    process_kind: ProcessKind,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+    nclients: u64,
+) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
 
-    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    conf.walredo_process_kind = process_kind;
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
     let manager = PostgresRedoManager::new(conf, tenant_shard_id);
     let manager = Arc::new(manager);
 
+    // divide the amount of work equally among the clients.
+    let nredos_per_client = n_redos / nclients;
     for _ in 0..nclients {
         rt.block_on(async {
             tasks.spawn(client(
                 Arc::clone(&manager),
                 Arc::clone(&start),
                 Arc::clone(&redo_work),
-                // divide the amount of work equally among the clients
-                n_redos / nclients,
+                nredos_per_client,
             ))
         });
     }
 
-    rt.block_on(async move {
-        let mut total_wallclock_time = std::time::Duration::from_millis(0);
+    let elapsed = rt.block_on(async move {
+        let mut total_wallclock_time = Duration::ZERO;
         while let Some(res) = tasks.join_next().await {
             total_wallclock_time += res.unwrap();
         }
         total_wallclock_time
-    })
+    });
+
+    // consistency check to ensure process kind setting worked
+    if nredos_per_client > 0 {
+        assert_eq!(
+            manager
+                .status()
+                .process
+                .map(|p| p.kind)
+                .expect("the benchmark work causes a walredo process to be spawned"),
+            std::borrow::Cow::Borrowed(process_kind.into())
+        );
+    }
+
+    elapsed
 }
 
 async fn client(
@@ -128,12 +128,12 @@ impl Client {
 
     pub async fn timeline_info(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         force_await_logical_size: ForceAwaitLogicalSize,
     ) -> Result<pageserver_api::models::TimelineInfo> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
             self.mgmt_api_endpoint
         );
 
@@ -151,11 +151,11 @@ impl Client {
 
     pub async fn keyspace(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
     ) -> Result<pageserver_api::models::partitioning::Partitioning> {
         let uri = format!(
-            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
             self.mgmt_api_endpoint
         );
         self.get(&uri)
@@ -279,7 +279,7 @@ impl Client {
         lazy: bool,
     ) -> Result<()> {
         let req_body = TenantLocationConfigRequest {
-            tenant_id: Some(tenant_shard_id),
+            tenant_id: None,
             config,
         };
 
@@ -11,7 +11,6 @@ default = []
 anyhow.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 chrono = { workspace = true, features = ["serde"] }
@@ -43,7 +43,8 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
     fanout: u64,
     ctx: &E::RequestContext,
 ) -> anyhow::Result<()> {
-    assert!(fanout >= 2);
+    assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}");
+    let exp_base = fanout.max(2);
     // Start at L0
     let mut current_level_no = 0;
     let mut current_level_target_height = target_file_size;
@@ -106,7 +107,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
             break;
         }
         current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(fanout);
+        current_level_target_height = current_level_target_height.saturating_mul(exp_base);
     }
     Ok(())
 }
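To see what the relaxed assertion changes, a sketch of the per-level target heights; `level_heights` is an invented helper mirroring the loop above, and the numbers are illustrative:

```rust
// How current_level_target_height evolves: exp_base = fanout.max(2) keeps the
// growth exponential even for fanout = 1, which previously hit the assert.
fn level_heights(target_file_size: u64, fanout: u64, levels: usize) -> Vec<u64> {
    let exp_base = fanout.max(2);
    std::iter::successors(Some(target_file_size), |h| Some(h.saturating_mul(exp_base)))
        .take(levels)
        .collect()
}

// level_heights(8, 1, 4) == [8, 16, 32, 64]   (fanout = 1 no longer panics)
// level_heights(8, 4, 4) == [8, 32, 128, 512]
```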
@@ -180,7 +180,7 @@ where
             match top.deref_mut() {
                 LazyLoadLayer::Unloaded(ref mut l) => {
                     let fut = l.load_keys(this.ctx);
-                    this.load_future.set(Some(fut));
+                    this.load_future.set(Some(Box::pin(fut)));
                     continue;
                 }
                 LazyLoadLayer::Loaded(ref mut entries) => {
@@ -3,7 +3,6 @@
 //!
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
-use async_trait::async_trait;
 use futures::Future;
 use pageserver_api::{key::Key, keyspace::key_range_size};
 use std::ops::Range;
@@ -141,18 +140,16 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
 
     fn is_delta(&self) -> bool;
 }
 
-#[async_trait]
 pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
     type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
     where
         Self: 'a;
 
     /// Return all keys in this delta layer.
-    async fn load_keys<'a>(
+    fn load_keys<'a>(
         &self,
         ctx: &E::RequestContext,
-    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
+    ) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
 }
 
 pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
@@ -2,7 +2,6 @@ mod draw;
 
 use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
 
-use async_trait::async_trait;
 use futures::StreamExt;
 use rand::Rng;
 use tracing::info;
@@ -139,7 +138,6 @@ impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
     }
 }
 
-#[async_trait]
 impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
     type DeltaEntry<'a> = MockRecord;
 
@@ -12,9 +12,14 @@ bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
+humantime.workspace = true
 pageserver = { path = ".." }
+pageserver_api.workspace = true
+remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
+toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
@@ -9,18 +9,45 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//! # Usage
+//!
+//! ## Producing the SVG
+//!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//! $ firefox out.svg
+//!
+//! # local timeline dir
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!
+//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
+//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
+//!
+//! # From an `index_part.json` in S3
+//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
+//!
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
+//! ## Viewing
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
-//! <https://grafana.com/tutorials/build-a-panel-plugin/>
+//! **Inkscape** is better than the built-in viewers in browsers.
+//!
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
+//! to see the layer file name in the comment field.
+//!
+//! ```bash
+//!
+//! # Linux
+//! inkscape out.svg
+//!
+//! # macOS
+//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
+//!
+//! ```
+//!
 
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
 
 pub fn main() -> Result<()> {
     // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
+        filename: String,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    }
+    let mut files: Vec<Layer> = vec![];
     let stdin = io::stdin();
     for line in stdin.lock().lines() {
         let line = line.unwrap();
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
             // Don't try and parse "metadata" like a key-lsn range
             continue;
         }
-        let range = parse_filename(filename);
-        ranges.push(range);
+        let (key_range, lsn_range) = parse_filename(filename);
+        files.push(Layer {
+            filename: filename.to_owned(),
+            key_range,
+            lsn_range,
+        });
     }
 
     // Collect all coordinates
     let mut keys: Vec<Key> = vec![];
     let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        key_range: keyr,
+        lsn_range: lsnr,
+        ..
+    } in &files
+    {
         keys.push(keyr.start);
         keys.push(keyr.end);
         lsns.push(lsnr.start);
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
             h: stretch * lsn_map.len() as f32
         }
     );
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        filename,
+        key_range: keyr,
+        lsn_range: lsnr,
+    } in &files
+    {
         let key_start = *key_map.get(&keyr.start).unwrap();
         let key_end = *key_map.get(&keyr.end).unwrap();
         let key_diff = key_end - key_start;
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
             .fill(fill)
             .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
             .border_radius(0.4)
+            .comment(filename)
     );
     }
     println!("{}", EndSvg);
@@ -9,6 +9,11 @@ mod index_part;
 mod layer_map_analyzer;
 mod layers;
 
+use std::{
+    str::FromStr,
+    time::{Duration, SystemTime},
+};
+
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use index_part::IndexPartCmd;
@@ -20,8 +25,16 @@ use pageserver::{
     tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
     virtual_file,
 };
+use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
+use remote_storage::{RemotePath, RemoteStorageConfig};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::TimelineId,
+    logging::{self, LogFormat, TracingErrorLayerEnablement},
+    lsn::Lsn,
+    project_git_version,
+};
 
 project_git_version!(GIT_VERSION);
 
@@ -43,6 +56,7 @@ enum Commands {
     #[command(subcommand)]
     IndexPart(IndexPartCmd),
     PrintLayerFile(PrintLayerFileCmd),
+    TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd),
     DrawTimeline {},
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
     #[command(subcommand)]
@@ -68,6 +82,26 @@ struct PrintLayerFileCmd {
     path: Utf8PathBuf,
 }
 
+/// Roll back the time for the specified prefix using S3 history.
+///
+/// The command is fairly low level and powerful. Validation is only very light,
+/// so it is more powerful, and thus potentially more dangerous.
+#[derive(Parser)]
+struct TimeTravelRemotePrefixCmd {
+    /// A configuration string for the remote_storage configuration.
+    ///
+    /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }`
+    config_toml_str: String,
+    /// remote prefix to time travel recover. For safety reasons, we require it to contain
+    /// a timeline or tenant ID in the prefix.
+    prefix: String,
+    /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy.
+    travel_to: String,
+    /// Timestamp of the start of the operation, must be after any changes we want to roll back and after.
+    /// You can use a few seconds before invoking the command. Same format as `travel_to`.
+    done_if_after: Option<String>,
+}
+
 #[derive(Parser)]
 struct AnalyzeLayerMapCmd {
     /// Pageserver data path
|
|||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
logging::init(
|
||||||
|
LogFormat::Plain,
|
||||||
|
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||||
|
logging::Output::Stdout,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||||
|
|
||||||
let cli = CliOpts::parse();
|
let cli = CliOpts::parse();
|
||||||
|
|
||||||
match cli.command {
|
match cli.command {
|
||||||
@@ -105,6 +147,42 @@ async fn main() -> anyhow::Result<()> {
                 print_layerfile(&cmd.path).await?;
             }
         }
+        Commands::TimeTravelRemotePrefix(cmd) => {
+            let timestamp = humantime::parse_rfc3339(&cmd.travel_to)
+                .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?;
+
+            let done_if_after = if let Some(done_if_after) = &cmd.done_if_after {
+                humantime::parse_rfc3339(done_if_after).map_err(|_e| {
+                    anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after)
+                })?
+            } else {
+                const SAFETY_MARGIN: Duration = Duration::from_secs(3);
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                // Convert to string representation and back to get rid of sub-second values
+                let done_if_after = SystemTime::now();
+                tokio::time::sleep(SAFETY_MARGIN).await;
+                done_if_after
+            };
+
+            let timestamp = strip_subsecond(timestamp);
+            let done_if_after = strip_subsecond(done_if_after);
+
+            let Some(prefix) = validate_prefix(&cmd.prefix) else {
+                println!("specified prefix '{}' failed validation", cmd.prefix);
+                return Ok(());
+            };
+            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
+            let toml_item = toml_document
+                .get("remote_storage")
+                .expect("need remote_storage");
+            let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config");
+            let storage = remote_storage::GenericRemoteStorage::from_config(&config);
+            let cancel = CancellationToken::new();
+            storage
+                .unwrap()
+                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
+                .await?;
+        }
     };
     Ok(())
 }
@@ -185,3 +263,89 @@ fn handle_metadata(
 
     Ok(())
 }
+
+/// Ensures that the given S3 prefix is sufficiently constrained.
+/// The command is very risky already and we don't want to expose something
+/// that allows usually unintentional and quite catastrophic time travel of
+/// an entire bucket, which would be a major catastrophe, and only one
+/// character change away (similar to "rm -r /home /username/foobar").
+fn validate_prefix(prefix: &str) -> Option<RemotePath> {
+    if prefix.is_empty() {
+        // Empty prefix means we want to specify the *whole* bucket
+        return None;
+    }
+    let components = prefix.split('/').collect::<Vec<_>>();
+    let (last, components) = {
+        let last = components.last()?;
+        if last.is_empty() {
+            (
+                components.iter().nth_back(1)?,
+                &components[..(components.len() - 1)],
+            )
+        } else {
+            (last, &components[..])
+        }
+    };
+    'valid: {
+        if let Ok(_timeline_id) = TimelineId::from_str(last) {
+            // Ends in either a tenant or timeline ID
+            break 'valid;
+        }
+        if *last == "timelines" {
+            if let Some(before_last) = components.iter().nth_back(1) {
+                if let Ok(_tenant_id) = TenantShardId::from_str(before_last) {
+                    // Has a valid tenant id
+                    break 'valid;
+                }
+            }
+        }
+
+        return None;
+    }
+    RemotePath::from_string(prefix).ok()
+}
+
+fn strip_subsecond(timestamp: SystemTime) -> SystemTime {
+    let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string();
+    humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validate_prefix() {
+        assert_eq!(validate_prefix(""), None);
+        assert_eq!(validate_prefix("/"), None);
+        #[track_caller]
+        fn assert_valid(prefix: &str) {
+            let remote_path = RemotePath::from_string(prefix).unwrap();
+            assert_eq!(validate_prefix(prefix), Some(remote_path));
+        }
+        assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/");
+        // Path is not relative but absolute
+        assert_eq!(
+            validate_prefix(
+                "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"
+            ),
+            None
+        );
+        assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/");
+        // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix
+        assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None);
+        assert_eq!(validate_prefix("wal"), None);
+        assert_eq!(validate_prefix("/wal/"), None);
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001");
+        // Partial tenant ID
+        assert_eq!(
+            validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"),
+            None
+        );
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/");
+        assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683");
+        assert_eq!(validate_prefix("pageserver/v1/tenants/"), None);
+    }
+}
@@ -1,4 +1,5 @@
 use anyhow::Context;
+use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;
 
@@ -95,7 +96,7 @@ async fn main_impl(
             let timeline = *timeline;
             let info = mgmt_api_client
                 .timeline_info(
-                    timeline.tenant_id,
+                    TenantShardId::unsharded(timeline.tenant_id),
                     timeline.timeline_id,
                     ForceAwaitLogicalSize::No,
                 )
@@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
 
+use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
@@ -173,7 +174,10 @@ async fn main_impl(
             let timeline = *timeline;
             async move {
                 let partitioning = mgmt_api_client
-                    .keyspace(timeline.tenant_id, timeline.timeline_id)
+                    .keyspace(
+                        TenantShardId::unsharded(timeline.tenant_id),
+                        timeline.timeline_id,
+                    )
                     .await?;
                 let lsn = partitioning.at_lsn;
                 let start = Instant::now();
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use humantime::Duration;
+use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;
 
@@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
         let mgmt_api_client = Arc::clone(&mgmt_api_client);
         js.spawn(async move {
             let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                .timeline_info(
+                    TenantShardId::unsharded(tl.tenant_id),
+                    tl.timeline_id,
+                    ForceAwaitLogicalSize::Yes,
+                )
                 .await
                 .unwrap();
 
@@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
             while !info.current_logical_size_is_accurate {
                 ticker.tick().await;
                 info = mgmt_api_client
-                    .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
+                    .timeline_info(
+                        TenantShardId::unsharded(tl.tenant_id),
+                        tl.timeline_id,
+                        ForceAwaitLogicalSize::Yes,
+                    )
                     .await
                     .unwrap();
             }
112 pageserver/src/aux_file.rs Normal file
@@ -0,0 +1,112 @@
+use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
+use tracing::warn;
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
+    let mut key = [0; METADATA_KEY_SIZE];
+    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    key[0] = AUX_KEY_PREFIX;
+    key[1] = dir_level1;
+    key[2] = dir_level2;
+    key[3..16].copy_from_slice(&hash[0..13]);
+    Key::from_metadata_key_fixed_size(&key)
+}
+
+const AUX_DIR_PG_LOGICAL: u8 = 0x01;
+const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
+
+/// Encode the aux file into a fixed-size key.
+///
+/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
+/// We have a one-to-one mapping for each of the aux files that we support. We hash the remaining part of the path
+/// (usually a single file name, or several components) into a 13-byte hash. The way we determine the 2-byte prefix
+/// is roughly based on the first two components of the path, one unique number for one component.
+///
+/// * pg_logical/mappings -> 0x0101
+/// * pg_logical/snapshots -> 0x0102
+/// * pg_logical/replorigin_checkpoint -> 0x0103
+/// * pg_logical/others -> 0x01FF
+/// * pg_replslot/ -> 0x0201
+/// * others -> 0xFFFF
+///
+/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
+/// The new file type must have never been written to the storage before. Otherwise, there could be data
+/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
+pub fn encode_aux_file_key(path: &str) -> Key {
+    if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
+    } else if path == "pg_logical/replorigin_checkpoint" {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
+    } else if let Some(fname) = path.strip_prefix("pg_logical/") {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_portable() {
+        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
+        // if the algorithm produces the same hash across different environments.
+        assert_eq!(
+            305317690835051308206966631765527126151,
+            twox_hash::xxh3::hash128("test1".as_bytes())
+        );
+        assert_eq!(
+            85104974691013376326742244813280798847,
+            twox_hash::xxh3::hash128("test/test2".as_bytes())
+        );
+        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
+    }
+
+    #[test]
+    fn test_encoding_portable() {
+        // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
+        // of the page server.
+        assert_eq!(
+            "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+        );
+        assert_eq!(
+            "820000010239AAC544893139B26F501B97E6",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+        );
+        assert_eq!(
+            "820000010300000000000000000000000000",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+        );
+        assert_eq!(
+            "82000001FF8635AF2134B7266EC5B4189FD6",
+            encode_aux_file_key("pg_logical/unsupported").to_string()
+        );
+        assert_eq!(
+            "8200000201772D0E5D71DE14DA86142A1619",
+            encode_aux_file_key("pg_replslot/test3").to_string()
+        );
+        assert_eq!(
+            "820000FFFF1866EBEB53B807B26A2416F317",
+            encode_aux_file_key("other_file_not_supported").to_string()
+        );
+    }
+}
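A short sketch of the properties the encoding is meant to guarantee; the file names are made up, and `test_encoding_portable` above pins the exact expected keys:

```rust
#[test]
fn encoding_properties_sketch() {
    let a = encode_aux_file_key("pg_logical/snapshots/a.snap");
    let b = encode_aux_file_key("pg_logical/snapshots/b.snap");
    // Deterministic: the same path always maps to the same fixed-size key.
    assert_eq!(a, encode_aux_file_key("pg_logical/snapshots/a.snap"));
    // Distinct files land on distinct keys (a 13-byte hash makes collisions negligible).
    assert_ne!(a, b);
}
```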
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
             if rel.forknum == INIT_FORKNUM {
                 // I doubt we need _init fork itself, but having it at least
                 // serves as a marker relation is unlogged.
-                self.add_rel(rel, rel).await?;
+                if let Err(_e) = self.add_rel(rel, rel).await {
+                    if self
+                        .timeline
+                        .get_shard_identity()
+                        .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                    {
+                        // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                        // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
+                        // postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
+                        // recreate.
+                        tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                        continue;
+                    }
+                };
                 self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                 continue;
             }
@@ -18,6 +18,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
+use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tracing::*;
 
@@ -284,6 +285,7 @@ fn start_pageserver(
     ))
     .unwrap();
     pageserver::preinitialize_metrics();
+    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -671,42 +673,37 @@ fn start_pageserver(
     let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
 
     // All started up! Now just sit and wait for shutdown signal.
-    {
-        use signal_hook::consts::*;
-        let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
-            let mut signals =
-                signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
-            return signals
-                .forever()
-                .next()
-                .expect("forever() never returns None unless explicitly closed");
-        });
-        let signal = BACKGROUND_RUNTIME
-            .block_on(signal_handler)
-            .expect("join error");
-        match signal {
-            SIGQUIT => {
-                info!("Got signal {signal}. Terminating in immediate shutdown mode",);
-                std::process::exit(111);
-            }
-            SIGINT | SIGTERM => {
-                info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
-
-                // This cancels the `shutdown_pageserver` cancellation tree.
-                // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-                // The plan is to change that over time.
-                shutdown_pageserver.take();
-                let bg_remote_storage = remote_storage.clone();
-                let bg_deletion_queue = deletion_queue.clone();
-                BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
-                    &tenant_manager,
-                    bg_remote_storage.map(|_| bg_deletion_queue),
-                    0,
-                ));
-                unreachable!()
-            }
-            _ => unreachable!(),
-        }
-    }
+
+    {
+        BACKGROUND_RUNTIME.block_on(async move {
+            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
+            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
+            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
+            let signal = tokio::select! {
+                _ = sigquit.recv() => {
+                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
+                    std::process::exit(111);
+                }
+                _ = sigint.recv() => { "SIGINT" },
+                _ = sigterm.recv() => { "SIGTERM" },
+            };
+
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            pageserver::shutdown_pageserver(
+                &tenant_manager,
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            )
+            .await;
+            unreachable!()
+        })
+    }
 }
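The rewritten shutdown path above swaps the blocking signal_hook iterator for tokio's async signal streams, so the wait can live on BACKGROUND_RUNTIME without a dedicated blocking thread. A minimal, self-contained sketch of the same pattern (plain tokio, independent of the pageserver types; assumes tokio with the `signal`, `macros` and `rt-multi-thread` features):

    use tokio::signal::unix::{signal, SignalKind};

    #[tokio::main]
    async fn main() {
        // One stream per signal; registration can only fail at setup time.
        let mut sigint = signal(SignalKind::interrupt()).unwrap();
        let mut sigterm = signal(SignalKind::terminate()).unwrap();
        let mut sigquit = signal(SignalKind::quit()).unwrap();

        // select! resolves with whichever signal arrives first.
        let name = tokio::select! {
            _ = sigquit.recv() => "SIGQUIT",
            _ = sigint.recv() => "SIGINT",
            _ = sigterm.recv() => "SIGTERM",
        };
        println!("got {name}, shutting down");
    }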
@@ -97,6 +97,8 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+
     ///
     /// Default built-in configuration file.
     ///
@@ -140,6 +142,8 @@ pub mod defaults {
 
 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
 
+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -290,6 +294,8 @@ pub struct PageServerConf {
     ///
     /// Setting this to zero disables limits on total ephemeral layer size.
     pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,
 
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+   walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }
 
 impl PageServerConfigBuilder {
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
             )),
             validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
             ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
         }
     }
 }
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
         self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
     }
 
+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let default = Self::default_values();
 
@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
             max_vectored_read_bytes,
             validate_vectored_get,
             ephemeral_bytes_per_memory_kb,
+            walredo_process_kind,
         }
         CUSTOM LOGIC
         {
@@ -1032,6 +1047,9 @@ impl PageServerConf {
             "ephemeral_bytes_per_memory_kb" => {
                 builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
             }
+            "walredo_process_kind" => {
+                builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+            }
             _ => bail!("unrecognized pageserver option '{key}'"),
         }
     }
@@ -1114,6 +1132,7 @@ impl PageServerConf {
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
             ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         }
     }
 }
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
                 .expect("Invalid default constant")
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         },
         "Correct defaults should be used when no config values are provided"
     );
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
                 .expect("Invalid default constant")
             ),
             validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
         },
         "Should be able to parse all basic config values correctly"
     );
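The `.parse().unwrap()` calls introduced above rely on `crate::walredo::ProcessKind` implementing `FromStr` for the accepted spellings. A hypothetical sketch of such a type (the real definition lives in the walredo module and may differ, e.g. use a derive macro; a `Display` impl, omitted here, is also needed by the `{kind}` formatting in the metrics code):

    use std::str::FromStr;

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum ProcessKind {
        Sync,
        Async,
    }

    impl FromStr for ProcessKind {
        type Err = String;

        fn from_str(s: &str) -> Result<Self, Self::Err> {
            match s {
                "sync" => Ok(ProcessKind::Sync),
                "async" => Ok(ProcessKind::Async),
                other => Err(format!("invalid walredo process kind: {other}")),
            }
        }
    }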
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
                 continue;
             }
 
-            if !tenant_shard_id.is_zero() {
+            if !tenant_shard_id.is_shard_zero() {
                 // We only send consumption metrics from shard 0, so don't waste time calculating
                 // synthetic size on other shards.
                 continue;
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
     };
 
     let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
-        if state != TenantState::Active || !id.is_zero() {
+        if state != TenantState::Active || !id.is_shard_zero() {
             None
         } else {
             tenant_manager
@@ -12,7 +12,7 @@ use pageserver_api::{
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, generation::Generation, id::NodeId};
+use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
 
 use crate::{
     config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                 .collect(),
         };
 
-        fail::fail_point!("control-plane-client-validate");
+        failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
+        if self.cancel.is_cancelled() {
+            return Err(RetryForeverError::ShuttingDown);
+        }
 
         let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
 
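The `sleep_millis_async!` failpoint macro used above is defined elsewhere in the tree; the shape it presumably expands to, a pause of a configured number of milliseconds that wakes early on cancellation, can be sketched with plain tokio and tokio_util (an illustration only; `sleep_or_cancel` is not a real helper in this codebase):

    use tokio_util::sync::CancellationToken;

    /// Sleep for `millis`, but return early if `cancel` fires first.
    async fn sleep_or_cancel(millis: u64, cancel: &CancellationToken) {
        tokio::select! {
            _ = tokio::time::sleep(std::time::Duration::from_millis(millis)) => {}
            _ = cancel.cancelled() => {}
        }
    }

Either way the caller still checks `cancel.is_cancelled()` afterwards, as the new code does, so a cancellation observed during the pause becomes `RetryForeverError::ShuttingDown` instead of another HTTP attempt.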
@@ -58,24 +58,6 @@ paths:
       responses:
         "200":
           description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
   /v1/tenant/{tenant_id}:
     parameters:
@@ -93,62 +75,14 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     delete:
       description: |
        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "404":
-          description: Tenant not found
+          description: Tenant not found. This is the success path.
           content:
            application/json:
              schema:
@@ -165,18 +99,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/time_travel_remote_storage:
     parameters:
@@ -206,36 +128,6 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline:
     parameters:
@@ -255,36 +147,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -309,60 +171,12 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     delete:
       description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "404":
-          description: Timeline not found
+          description: Timeline not found. This is the success path.
          content:
            application/json:
              schema:
@@ -379,18 +193,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
     parameters:
@@ -423,36 +225,6 @@ paths:
              schema:
                type: string
                format: date-time
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found, or there is no timestamp information for the given lsn
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
     parameters:
@@ -484,36 +256,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
     parameters:
@@ -537,36 +279,6 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
   /v1/tenant/{tenant_shard_id}/location_config:
     parameters:
       - name: tenant_shard_id
@@ -628,24 +340,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantLocationConfigResponse"
-        "503":
-          description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "409":
           description: |
             The tenant is already known to Pageserver in some way,
@@ -662,12 +356,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
   /v1/tenant/{tenant_id}/ignore:
     parameters:
       - name: tenant_id
@@ -684,36 +372,6 @@ paths:
       responses:
         "200":
           description: Tenant ignored
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
 
   /v1/tenant/{tenant_id}/load:
@@ -740,36 +398,6 @@ paths:
       responses:
         "202":
           description: Tenant scheduled to load successfully
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
     parameters:
@@ -790,37 +418,6 @@ paths:
       responses:
         "202":
           description: Tenant scheduled to load successfully
-        "404":
-          description: No tenant or timeline found for the specified ids
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 
   /v1/tenant/{tenant_id}/synthetic_size:
     parameters:
@@ -839,31 +436,8 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
+  # This route has no handler. TODO: remove?
   /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
@@ -945,18 +519,6 @@ paths:
       responses:
         "200":
           description: Success
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_shard_id}/secondary/download:
     parameters:
@@ -987,20 +549,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
 
   /v1/tenant/{tenant_id}/timeline/:
     parameters:
@@ -1043,24 +591,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Malformed timeline create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "406":
           description: Permanently unsatisfiable request, don't retry.
          content:
@@ -1079,18 +609,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/:
     get:
@@ -1104,30 +622,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
     post:
       description: |
@@ -1148,43 +642,12 @@ paths:
             application/json:
              schema:
                type: string
-        "400":
-          description: Malformed tenant create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
         "409":
           description: Tenant already exists, creation skipped
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
 
   /v1/tenant/config:
     put:
@@ -1206,36 +669,6 @@ paths:
                 type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Malformed tenant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/tenant/{tenant_id}/config/:
     parameters:
@@ -1255,42 +688,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/TenantConfigResponse"
-        "400":
-          description: Malformed get tenanant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenand or timeline were not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
 
   /v1/utilization:
     get:
@@ -1304,12 +701,6 @@ paths:
             application/json:
              schema:
                $ref: "#/components/schemas/PageserverUtilization"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
 
 components:
   securitySchemes:
@@ -1629,7 +1020,7 @@ components:
             type: integer
            format: int64
            minimum: 0
-            description: The amount of disk space currently utilized by layer files.
+            description: The amount of disk space currently used.
           free_space_bytes:
            type: integer
            format: int64
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
     fn from(pre: PageReconstructError) -> ApiError {
         match pre {
             PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::MissingKey(e) => {
+                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
+            }
             PageReconstructError::Cancelled => {
                 ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
             }
@@ -457,8 +460,12 @@ async fn reload_auth_validation_keys_handler(
             json_response(StatusCode::OK, ())
         }
         Err(e) => {
+            let err_msg = "Error reloading public keys";
             warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+            json_response(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                HttpErrorBody::from_msg(err_msg.to_string()),
+            )
         }
     }
 }
@@ -696,7 +703,7 @@ async fn get_lsn_by_timestamp_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
@@ -747,7 +754,7 @@ async fn get_timestamp_of_lsn_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
@@ -772,7 +779,9 @@ async fn get_timestamp_of_lsn_handler(
             let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
             json_response(StatusCode::OK, time)
         }
-        None => json_response(StatusCode::NOT_FOUND, ()),
+        None => Err(ApiError::NotFound(
+            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        )),
     }
 }
@@ -993,11 +1002,26 @@ async fn tenant_status(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);
 
+    // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting.
+    let activate = true;
+    #[cfg(feature = "testing")]
+    let activate = parse_query_param(&request, "activate")?.unwrap_or(activate);
+
     let tenant_info = async {
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
+
+        if activate {
+            // This is advisory: we prefer to let the tenant activate on-demand when this function is
+            // called, but it is still valid to return 200 and describe the current state of the tenant
+            // if it doesn't make it into an active state.
+            tenant
+                .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
+                .await
+                .ok();
+        }
+
         // Calculate total physical size of all timelines
         let mut current_physical_size = 0;
         for timeline in tenant.list_timelines().iter() {
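With the `activate` flag above, builds compiled with the `testing` feature can inspect a tenant's state without nudging it towards activation, e.g. `GET /v1/tenant/{tenant_shard_id}?activate=false` (the query-string form is inferred from the `parse_query_param` call, not stated elsewhere); production builds always keep the advisory activation.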
@@ -1071,7 +1095,7 @@ async fn tenant_size_handler(
     let headers = request.headers();
     let state = get_state(&request);
 
-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
         return Err(ApiError::BadRequest(anyhow!(
             "Size calculations are only available on shard zero"
         )));
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
+use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
@@ -170,7 +171,10 @@ async fn import_rel(
         let r = reader.read_exact(&mut buf).await;
         match r {
             Ok(_) => {
-                modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                let key = rel_block_to_key(rel, blknum);
+                if modification.tline.get_shard_identity().is_key_local(&key) {
+                    modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
+                }
             }
 
             // TODO: UnexpectedEof is expected
@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub use pageserver_api::keyspace;
+pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
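`is_key_local` here, like the `is_key_buggy_forknum` check in the basebackup hunk earlier, consults the tenant's `ShardIdentity` to decide which shard owns a key. As a rough illustration of the idea only (a hypothetical stand-in type; the real `ShardIdentity` places keys by striping block ranges across shards, not by a plain modulo):

    /// Simplified stand-in for the pageserver's ShardIdentity.
    struct ShardIdentity {
        shard_number: u32,
        shard_count: u32,
    }

    impl ShardIdentity {
        /// Does this shard own the given key? (toy placement: hash modulo count)
        fn is_key_local(&self, key_hash: u64) -> bool {
            self.shard_count <= 1 || key_hash % self.shard_count as u64 == self.shard_number as u64
        }
    }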
@@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
-        "pageserver_read_num_fs_layers",
-        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
     )
     .expect("failed to define a metric")
 });
@@ -1483,12 +1492,18 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 });
 
 pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
     records_received: register_int_counter!(
         "pageserver_wal_ingest_records_received",
         "Number of WAL records received from safekeepers"
@@ -1512,7 +1527,8 @@ pub(crate) struct SecondaryModeMetrics {
     pub(crate) download_heatmap: IntCounter,
     pub(crate) download_layer: IntCounter,
 }
-pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
+pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
+    SecondaryModeMetrics {
     upload_heatmap: register_int_counter!(
         "pageserver_secondary_upload_heatmap",
         "Number of heatmaps written to remote storage by attached tenants"
@@ -1530,7 +1546,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
     .expect("failed to define a metric"),
     download_heatmap: register_int_counter!(
         "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
+        "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
     )
     .expect("failed to define a metric"),
     download_layer: register_int_counter!(
@@ -1538,6 +1554,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
         "Number of downloads of layers by secondary mode locations"
     )
     .expect("failed to define a metric"),
+    }
 });
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1813,6 +1830,29 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);
 
+#[cfg(not(test))]
+pub mod wal_redo {
+    use super::*;
+
+    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
+        std::sync::Mutex::new(
+            register_uint_gauge_vec!(
+                "pageserver_wal_redo_process_kind",
+                "The configured process kind for walredo",
+                &["kind"],
+            )
+            .unwrap(),
+        )
+    });
+
+    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
+        // use guard to avoid races around the next two steps
+        let guard = PROCESS_KIND.lock().unwrap();
+        guard.reset();
+        guard.with_label_values(&[&format!("{kind}")]).set(1);
+    }
+}
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
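Since `set_process_kind_metric` resets the gauge vector before setting a single label value to 1, at most one `kind` series is ever non-zero. The scrape output would then contain a line like the following (sample only, assuming the `sync` default):

    pageserver_wal_redo_process_kind{kind="sync"} 1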
@@ -2083,7 +2123,7 @@ impl TimelineMetrics {
 
 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
     // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
         let tid = tenant_shard_id.tenant_id.to_string();
         let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
     }
@@ -2094,6 +2134,7 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
 use futures::Future;
 use pin_project_lite::pin_project;
 use std::collections::HashMap;
+use std::num::NonZeroUsize;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
@@ -2663,6 +2704,26 @@ pub(crate) mod disk_usage_based_eviction {
     pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
 }
 
+static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tokio_executor_thread_configured_count",
+        "Total number of configued tokio executor threads in the process.
+         The `setup` label denotes whether we're running with multiple runtimes or a single runtime.",
+        &["setup"],
+    )
+    .unwrap()
+});
+
+pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
+    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
+    let _guard = SERIALIZE.lock().unwrap();
+    TOKIO_EXECUTOR_THREAD_COUNT.reset();
+    TOKIO_EXECUTOR_THREAD_COUNT
+        .get_metric_with_label_values(&[setup])
+        .unwrap()
+        .set(u64::try_from(num_threads.get()).unwrap());
+}
+
 pub fn preinitialize_metrics() {
     // Python tests need these and on some we do alerting.
     //
@@ -2719,7 +2780,8 @@ pub fn preinitialize_metrics() {
|
|||||||
|
|
||||||
// histograms
|
// histograms
|
||||||
[
|
[
|
||||||
&READ_NUM_FS_LAYERS,
|
&READ_NUM_LAYERS_VISITED,
|
||||||
|
&VEC_READ_NUM_LAYERS_VISITED,
|
||||||
&WAIT_LSN_TIME,
|
&WAIT_LSN_TIME,
|
||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
|
|||||||
@@ -874,9 +874,20 @@ impl PageServerHandler {
|
|||||||
// walsender completes the authentication and starts streaming the
|
// walsender completes the authentication and starts streaming the
|
||||||
// WAL.
|
// WAL.
|
||||||
if lsn <= last_record_lsn {
|
if lsn <= last_record_lsn {
|
||||||
|
// It might be better to use max(lsn, latest_gc_cutoff_lsn) instead
|
||||||
|
// last_record_lsn. That would give the same result, since we know
|
||||||
|
// that there haven't been modifications since 'lsn'. Using an older
|
||||||
|
// LSN might be faster, because that could allow skipping recent
|
||||||
|
// layers when finding the page.
|
||||||
lsn = last_record_lsn;
|
lsn = last_record_lsn;
|
||||||
} else {
|
} else {
|
||||||
timeline.wait_lsn(lsn, ctx).await?;
|
timeline
|
||||||
|
.wait_lsn(
|
||||||
|
lsn,
|
||||||
|
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
// Since we waited for 'lsn' to arrive, that is now the last
|
// Since we waited for 'lsn' to arrive, that is now the last
|
||||||
// record LSN. (Or close enough for our purposes; the
|
// record LSN. (Or close enough for our purposes; the
|
||||||
// last-record LSN can advance immediately after we return
|
// last-record LSN can advance immediately after we return
|
||||||
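Every `wait_lsn` call site in this file now passes a `WaitLsnWaiter` value identifying who is waiting (`WaitLsnWaiter::PageService` here; the tenant code further down passes `WaitLsnWaiter::Tenant`). The diff never shows the enum definition itself; inferred from the call sites, it presumably looks something like this sketch:

```rust
// Hypothetical shape inferred from the call sites in this diff. The real
// definition lives in the pageserver timeline module and may carry more
// variants, one per kind of caller that waits for WAL to arrive.
#[derive(Debug)]
pub(crate) enum WaitLsnWaiter {
    PageService,
    Tenant,
}
```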
@@ -888,7 +899,13 @@ impl PageServerHandler {
                     "invalid LSN(0) in request".into(),
                 ));
             }
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
         }

         if lsn < **latest_gc_cutoff_lsn {

@@ -1189,6 +1206,10 @@ impl PageServerHandler {
         ))
     }

+    /// Note on "fullbackup":
+    /// Full basebackups should only be used for debugging purposes.
+    /// Originally, it was introduced to enable breaking storage format changes,
+    /// but that is not applicable anymore.
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
     async fn handle_basebackup_request<IO>(

@@ -1215,7 +1236,13 @@ impl PageServerHandler {
         if let Some(lsn) = lsn {
             // Backup was requested at a particular LSN. Wait for it to arrive.
             info!("waiting for {}", lsn);
-            timeline.wait_lsn(lsn, ctx).await?;
+            timeline
+                .wait_lsn(
+                    lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    ctx,
+                )
+                .await?;
             timeline
                 .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
                 .context("invalid basebackup lsn")?;

@@ -252,16 +252,8 @@ impl Timeline {
         let mut buf = version.get(self, key, ctx).await?;
         let nblocks = buf.get_u32_le();

-        if latest {
-            // Update relation size cache only if "latest" flag is set.
-            // This flag is set by compute when it is working with most recent version of relation.
-            // Typically master compute node always set latest=true.
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
         Ok(nblocks)
     }

@@ -456,6 +448,11 @@ impl Timeline {
             // include physical changes from later commits that will be marked
             // as aborted, and will need to be vacuumed away.
             let commit_lsn = Lsn((low - 1) * 8);
+            // This maxing operation is for the edge case that the search above did
+            // set found_smaller to true but it never increased the lsn. Then, low
+            // is still the old min_lsn the subtraction above could possibly give a value
+            // below the anchestor_lsn.
+            let commit_lsn = commit_lsn.max(min_lsn);
             match (found_smaller, found_larger) {
                 (false, false) => {
                     // This can happen if no commit records have been processed yet, e.g.

@@ -817,7 +814,7 @@ impl Timeline {
     /// Get cached size of relation if it not updated after specified LSN
     pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
         let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
             if lsn >= *cached_lsn {
                 return Some(*nblocks);
             }

@@ -828,7 +825,16 @@ impl Timeline {
     /// Update cached relation size if there is no more recent update
     pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
             hash_map::Entry::Occupied(mut entry) => {
                 let cached_lsn = entry.get_mut();
                 if lsn >= cached_lsn.0 {

@@ -844,13 +850,13 @@ impl Timeline {
     /// Store cached relation size
     pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
     }

     /// Remove cached relation size
     pub fn remove_cached_rel_size(&self, tag: &RelTag) {
         let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
     }
 }

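The relation-size cache accesses change from `rel_size_cache.get/insert/remove` to going through a `.map` field, and `update_cached_rel_size` now refuses LSNs older than `complete_as_of`. Together that implies the cache grew from a bare map into a struct along these lines (a sketch with placeholder types, not the literal definition from the commit):

```rust
use std::collections::HashMap;

// Placeholder aliases standing in for pageserver's Lsn, RelTag and BlockNumber.
type Lsn = u64;
type RelTag = u32;
type BlockNumber = u32;

// Sketch of the implied structure: a map plus a watermark LSN. Reads at or
// above `complete_as_of` are safe to cache because WAL ingestion since that
// point has kept the map current, and entries are never evicted.
struct RelSizeCache {
    complete_as_of: Lsn,
    map: HashMap<RelTag, (Lsn, BlockNumber)>,
}

impl RelSizeCache {
    fn update(&mut self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        if lsn < self.complete_as_of {
            // Do not cache values read below the watermark: a newer size may
            // already be in the map and must not be shadowed by a stale one.
            return;
        }
        let entry = self.map.entry(tag).or_insert((lsn, nblocks));
        if lsn >= entry.0 {
            *entry = (lsn, nblocks);
        }
    }
}
```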
@@ -1401,7 +1407,7 @@ impl<'a> DatadirModification<'a> {
         let n_files;
         let mut aux_files = self.tline.aux_files.lock().await;
         if let Some(mut dir) = aux_files.dir.take() {
-            // We already updated aux files in `self`: emit a delta and update our latest value
+            // We already updated aux files in `self`: emit a delta and update our latest value.
             dir.upsert(file_path.clone(), content.clone());
             n_files = dir.files.len();
             if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {

@@ -1446,10 +1452,14 @@ impl<'a> DatadirModification<'a> {
                     // reset the map.
                     return Err(e.into());
                 }
-                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
-                // we are assuming that all _other_ possible errors represents a missing key. If some
-                // other error occurs, we may incorrectly reset the map of aux files.
-                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
+                // the original code assumes all other errors are missing keys. Therefore, we keep the code path
+                // the same for now, though in theory, we should only match the `MissingKey` variant.
+                Err(
+                    PageReconstructError::Other(_)
+                    | PageReconstructError::WalRedo(_)
+                    | PageReconstructError::MissingKey { .. },
+                ) => {
                     // Key is missing, we must insert an image as the basis for subsequent deltas.

                     let mut dir = AuxFilesDirectory {

@@ -33,6 +33,52 @@ impl Value {
         }
     }
 }
+
+#[cfg(test)]
+#[derive(Debug, PartialEq)]
+pub(crate) enum InvalidInput {
+    TooShortValue,
+    TooShortPostgresRecord,
+}
+
+/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
+/// use this type for querying if a slice looks some particular way.
+#[cfg(test)]
+pub(crate) struct ValueBytes;
+
+#[cfg(test)]
+impl ValueBytes {
+    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
+        if raw.len() < 12 {
+            return Err(InvalidInput::TooShortValue);
+        }
+
+        let value_discriminator = &raw[0..4];
+
+        if value_discriminator == [0, 0, 0, 0] {
+            // Value::Image always initializes
+            return Ok(true);
+        }
+
+        if value_discriminator != [0, 0, 0, 1] {
+            // not a Value::WalRecord(..)
+            return Ok(false);
+        }
+
+        let walrecord_discriminator = &raw[4..8];
+
+        if walrecord_discriminator != [0, 0, 0, 0] {
+            // only NeonWalRecord::Postgres can have will_init
+            return Ok(false);
+        }
+
+        if raw.len() < 17 {
+            return Err(InvalidInput::TooShortPostgresRecord);
+        }
+
+        Ok(raw[8] == 1)
+    }
+}

 #[cfg(test)]
 mod test {
     use super::*;
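`ValueBytes::will_init` peeks at the serialized bytes instead of deserializing: offsets 0..4 hold the `Value` enum tag, 4..8 the `NeonWalRecord` tag, and offset 8 the `will_init` bool. The tag bytes the tests below assert ([0, 0, 0, 1] for variant 1) indicate a fixed-width big-endian u32 tag. A self-contained sketch of that encoding using bincode 1.x options (the `Demo` enum is illustrative only, not the real `Value` type):

```rust
use bincode::Options;
use serde::Serialize;

#[derive(Serialize)]
enum Demo {
    Image,           // variant 0
    WalRecord(bool), // variant 1
}

fn main() {
    // Fixed-width, big-endian integers reproduce the byte patterns the tests
    // assert: a 4-byte tag, then the payload bytes.
    let opts = bincode::options().with_big_endian().with_fixint_encoding();
    assert_eq!(opts.serialize(&Demo::Image).unwrap(), vec![0, 0, 0, 0]);
    assert_eq!(opts.serialize(&Demo::WalRecord(true)).unwrap(), vec![0, 0, 0, 1, 1]);
}
```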
@@ -70,6 +116,8 @@ mod test {
         ];

         roundtrip!(image, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
     }

     #[test]

@@ -93,6 +141,96 @@ mod test {
         ];

         roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_image() {
+        let rec = Value::Image(Bytes::from_static(b""));
+
+        #[rustfmt::skip]
+        let expected = [
+            // top level discriminator of 4 bytes
+            0x00, 0x00, 0x00, 0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 12);
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            );
+        }
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_postgres_record() {
+        let rec = NeonWalRecord::Postgres {
+            will_init: false,
+            rec: Bytes::from_static(b""),
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // flattened discriminator of total 8 bytes
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x00,
+            // will_init
+            0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 17);
+        for len in 12..17 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortPostgresRecord
+            )
+        }
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            )
+        }
+    }
+
+    #[test]
+    fn clear_visibility_map_flags_example() {
+        let rec = NeonWalRecord::ClearVisibilityMapFlags {
+            new_heap_blkno: Some(0x11),
+            old_heap_blkno: None,
+            flags: 0x03,
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // discriminators
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x01,
+            // Some == 1 followed by 4 bytes
+            0x01, 0x00, 0x00, 0x00, 0x11,
+            // None == 0
+            0x00,
+            // flags
+            0x03
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
     }
 }

@@ -33,13 +33,14 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::future::Future;
+use std::num::NonZeroUsize;
 use std::panic::AssertUnwindSafe;
+use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};

 use futures::FutureExt;
 use pageserver_api::shard::TenantShardId;
-use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
 use tokio_util::sync::CancellationToken;

@@ -48,8 +49,11 @@ use tracing::{debug, error, info, warn};

 use once_cell::sync::Lazy;

+use utils::env;
 use utils::id::TimelineId;

+use crate::metrics::set_tokio_runtime_setup;
+
 //
 // There are four runtimes:
 //

@@ -98,52 +102,119 @@ use utils::id::TimelineId;
 // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
 // happen, but still.
 //
-pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("compute request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create compute request runtime")
-});
-
-pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("mgmt request worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create mgmt request runtime")
-});
-
-pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("walreceiver worker")
-        .enable_all()
-        .build()
-        .expect("Failed to create walreceiver runtime")
-});
-
-pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
-    tokio::runtime::Builder::new_multi_thread()
-        .thread_name("background op worker")
-        // if you change the number of worker threads please change the constant below
-        .enable_all()
-        .build()
-        .expect("Failed to create background op runtime")
-});
-
-pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
-    // force init and thus panics
-    let _ = BACKGROUND_RUNTIME.handle();
+pub(crate) static TOKIO_WORKER_THREADS: Lazy<NonZeroUsize> = Lazy::new(|| {
     // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
     // tokio would had already panicked for parsing errors or NotUnicode
     //
     // this will be wrong if any of the runtimes gets their worker threads configured to something
     // else, but that has not been needed in a long time.
-    std::env::var("TOKIO_WORKER_THREADS")
-        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
+    NonZeroUsize::new(
+        std::env::var("TOKIO_WORKER_THREADS")
+            .map(|s| s.parse::<usize>().unwrap())
+            .unwrap_or_else(|_e| usize::max(2, num_cpus::get())),
+    )
+    .expect("the max() ensures that this is not zero")
 });
+
+enum TokioRuntimeMode {
+    SingleThreaded,
+    MultiThreaded { num_workers: NonZeroUsize },
+}
+
+impl FromStr for TokioRuntimeMode {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "current_thread" => Ok(TokioRuntimeMode::SingleThreaded),
+            s => match s.strip_prefix("multi_thread:") {
+                Some("default") => Ok(TokioRuntimeMode::MultiThreaded {
+                    num_workers: *TOKIO_WORKER_THREADS,
+                }),
+                Some(suffix) => {
+                    let num_workers = suffix.parse::<NonZeroUsize>().map_err(|e| {
+                        format!(
+                            "invalid number of multi-threaded runtime workers ({suffix:?}): {e}",
+                        )
+                    })?;
+                    Ok(TokioRuntimeMode::MultiThreaded { num_workers })
+                }
+                None => Err(format!("invalid runtime config: {s:?}")),
+            },
+        }
+    }
+}
+
+static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
+    let thread_name = "pageserver-tokio";
+    let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
+        // If the env var is not set, leave this static as None.
+        set_tokio_runtime_setup(
+            "multiple-runtimes",
+            NUM_MULTIPLE_RUNTIMES
+                .checked_mul(*TOKIO_WORKER_THREADS)
+                .unwrap(),
+        );
+        return None;
+    };
+    Some(match mode {
+        TokioRuntimeMode::SingleThreaded => {
+            set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap());
+            tokio::runtime::Builder::new_current_thread()
+                .thread_name(thread_name)
+                .enable_all()
+                .build()
+                .expect("failed to create one single runtime")
+        }
+        TokioRuntimeMode::MultiThreaded { num_workers } => {
+            set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers);
+            tokio::runtime::Builder::new_multi_thread()
+                .thread_name(thread_name)
+                .enable_all()
+                .worker_threads(num_workers.get())
+                .build()
+                .expect("failed to create one multi-threaded runtime")
+        }
+    })
+});
+
+/// Declare a lazy static variable named `$varname` that will resolve
+/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME`
+/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation
+/// declares a separate runtime and the lazy static variable `$varname`
+/// will resolve to that separate runtime.
+///
+/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if
+/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime
+/// otherwise.
+macro_rules! pageserver_runtime {
+    ($varname:ident, $name:literal) => {
+        pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| {
+            if let Some(runtime) = &*ONE_RUNTIME {
+                return runtime;
+            }
+            static RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
+                tokio::runtime::Builder::new_multi_thread()
+                    .thread_name($name)
+                    .worker_threads(TOKIO_WORKER_THREADS.get())
+                    .enable_all()
+                    .build()
+                    .expect(std::concat!("Failed to create runtime ", $name))
+            });
+            &*RUNTIME
+        });
+    };
+}
+
+pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
+pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
+pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
+pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
+// Bump this number when adding a new pageserver_runtime!
+// SAFETY: it's obviously correct
+const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };

 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);

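With this in place, `NEON_PAGESERVER_USE_ONE_RUNTIME` selects the runtime topology. Per the `FromStr` impl above it accepts `current_thread`, `multi_thread:default`, or `multi_thread:<N>` with `N >= 1`; anything else fails to parse. A sketch of the accepted values, assuming the module-private types were visible to a test:

```rust
// Accepted values, per from_str above:
assert!(matches!(
    "current_thread".parse::<TokioRuntimeMode>(),
    Ok(TokioRuntimeMode::SingleThreaded)
));
assert!(matches!(
    "multi_thread:4".parse::<TokioRuntimeMode>(),
    Ok(TokioRuntimeMode::MultiThreaded { .. })
));
// Rejected: zero workers (NonZeroUsize refuses 0) and unknown shapes.
assert!("multi_thread:0".parse::<TokioRuntimeMode>().is_err());
assert!("4".parse::<TokioRuntimeMode>().is_err());
```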
@@ -214,13 +285,12 @@ pub enum TaskKind {
    /// Internally, `Client` hands over requests to the `Connection` object.
    /// The `Connection` object is responsible for speaking the wire protocol.
    ///
-    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
-    /// That abstraction doesn't use `task_mgr`.
+    /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
-    /// Once the connection is established, the `TaskHandle` task creates a
-    /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling
+    /// Once the connection is established, the `TaskHandle` task spawns a
+    /// [`WalReceiverConnectionPoller`] task that is responsible for polling
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.

@@ -230,7 +300,6 @@ pub enum TaskKind {
    WalReceiverManager,

    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
-    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
    ///
    /// [`WalReceiverManager`]: Self::WalReceiverManager

@@ -12,6 +12,7 @@
 //!

 use anyhow::{bail, Context};
+use arc_swap::ArcSwap;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
 use enumset::EnumSet;

@@ -98,7 +99,7 @@ use std::ops::Bound::Included;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
-use std::sync::{Mutex, RwLock};
+use std::sync::Mutex;
 use std::time::{Duration, Instant};

 use crate::span;

@@ -260,7 +261,7 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<AttachedTenantConf>>,
+    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,

    tenant_shard_id: TenantShardId,

@@ -385,7 +386,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(m) => m.status(),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }

@@ -558,9 +559,10 @@ impl Tenant {
                // By doing what we do here, the index part upload is retried.
                // If control plane retries timeline creation in the meantime, the mgmt API handler
                // for timeline creation will coalesce on the upload we queue here.
+                // FIXME: this branch should be dead code as we no longer write local metadata.
                let rtc = timeline.remote_client.as_ref().unwrap();
                rtc.init_upload_queue_for_empty_remote(&metadata)?;
-                rtc.schedule_index_upload_for_metadata_update(&metadata)?;
+                rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
            }

            timeline

@@ -1515,7 +1517,7 @@ impl Tenant {
                // sizes etc. and that would get confused if the previous page versions
                // are not in the repository yet.
                ancestor_timeline
-                    .wait_lsn(*lsn, ctx)
+                    .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
                    .await
                    .map_err(|e| match e {
                        e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {

@@ -1606,7 +1608,7 @@ impl Tenant {
        );

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();

            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);

@@ -1633,7 +1635,7 @@ impl Tenant {
        }

        {
-            let conf = self.tenant_conf.read().unwrap();
+            let conf = self.tenant_conf.load();
            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(());

@@ -1782,7 +1784,7 @@ impl Tenant {
    async fn shutdown(
        &self,
        shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
+        shutdown_mode: timeline::ShutdownMode,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();

@@ -1829,16 +1831,8 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let timeline_id = timeline.timeline_id;
-                let span =
-                    tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
+                js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
            })
        };
        // test_long_timeline_create_then_tenant_delete is leaning on this message
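`Tenant::shutdown` now takes a `timeline::ShutdownMode` instead of a `freeze_and_flush: bool`, and the per-timeline branching moves into `Timeline::shutdown` itself. The diff shows two variants in use, `FreezeAndFlush` (in the tests further down) and `Hard`; a minimal sketch of the enum this implies:

```rust
// Inferred from the call sites in this diff; the real enum may carry more
// variants or data.
#[derive(Debug, Clone, Copy)]
pub(crate) enum ShutdownMode {
    /// Flush in-memory data to disk before shutting down
    /// (the old `freeze_and_flush: true`).
    FreezeAndFlush,
    /// Shut down without flushing (the old `freeze_and_flush: false`).
    Hard,
}
```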
@@ -2082,14 +2076,14 @@ impl Tenant {
    }

    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
-        self.tenant_conf.read().unwrap().location.attach_mode
+        self.tenant_conf.load().location.attach_mode
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
    /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
    /// rare external API calls, like a reconciliation at startup.
    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
-        let conf = self.tenant_conf.read().unwrap();
+        let conf = self.tenant_conf.load();

        let location_config_mode = match conf.location.attach_mode {
            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,

@@ -2236,7 +2230,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }

    pub fn effective_config(&self) -> TenantConf {

@@ -2245,84 +2239,84 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn get_heatmap_period(&self) -> Option<Duration> {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        let heatmap_period = tenant_conf
            .heatmap_period
            .unwrap_or(self.conf.default_tenant_conf.heatmap_period);

@@ -2334,26 +2328,40 @@ impl Tenant {
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
-        self.tenant_conf_updated();
+        // Use read-copy-update in order to avoid overwriting the location config
+        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // this race is not possible if both request types come from the storage
+        // controller (as they should!) because an exclusive op lock is required
+        // on the storage controller side.
+        self.tenant_conf.rcu(|inner| {
+            Arc::new(AttachedTenantConf {
+                tenant_conf: new_tenant_conf.clone(),
+                location: inner.location,
+            })
+        });
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
-        *self.tenant_conf.write().unwrap() = new_conf;
-        self.tenant_conf_updated();
+        let new_tenant_conf = new_conf.tenant_conf.clone();
+
+        self.tenant_conf.store(Arc::new(new_conf));
+
+        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated();
+            timeline.tenant_conf_updated(&new_tenant_conf);
        }
    }

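The `RwLock<AttachedTenantConf>` to `ArcSwap<AttachedTenantConf>` migration makes every getter a lock-free `load()`, while writers either `store` a whole new value or use `rcu` when they must preserve part of the current one. A self-contained sketch of the `rcu` pattern used in `set_new_tenant_config` above (the field names are illustrative):

```rust
use arc_swap::ArcSwap;
use std::sync::Arc;

struct Conf {
    tenant_overrides: u32,
    location: u32,
}

fn main() {
    let conf = ArcSwap::from_pointee(Conf { tenant_overrides: 1, location: 7 });

    // rcu re-runs the closure if another writer swapped the value in the
    // meantime, so `location` can never be clobbered with a stale copy.
    conf.rcu(|inner| {
        Arc::new(Conf {
            tenant_overrides: 2,
            location: inner.location,
        })
    });

    assert_eq!(conf.load().tenant_overrides, 2);
    assert_eq!(conf.load().location, 7);
}
```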
@@ -2367,11 +2375,8 @@ impl Tenant {
            .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
    }

-    pub(crate) fn tenant_conf_updated(&self) {
-        let conf = {
-            let guard = self.tenant_conf.read().unwrap();
-            Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf)
-        };
+    pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
        self.timeline_get_throttle.reconfigure(conf)
    }

@@ -2519,7 +2524,7 @@ impl Tenant {
                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
-            tenant_conf: Arc::new(RwLock::new(attached_conf)),
+            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
        }
    }

@@ -2865,20 +2870,23 @@ impl Tenant {
                }
            }

-            if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
-                let branchpoints: Vec<Lsn> = all_branchpoints
-                    .range((
-                        Included((timeline_id, Lsn(0))),
-                        Included((timeline_id, Lsn(u64::MAX))),
-                    ))
-                    .map(|&x| x.1)
-                    .collect();
-                timeline
-                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
-                    .await?;
-
-                gc_timelines.push(timeline);
-            }
+            let cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));
+
+            let branchpoints: Vec<Lsn> = all_branchpoints
+                .range((
+                    Included((timeline_id, Lsn(0))),
+                    Included((timeline_id, Lsn(u64::MAX))),
+                ))
+                .map(|&x| x.1)
+                .collect();
+            timeline
+                .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
+                .await?;
+
+            gc_timelines.push(timeline);
        }
        drop(gc_cs);
        Ok(gc_timelines)

@@ -3023,7 +3031,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_metadata_update(&metadata)
+                .schedule_index_upload_for_full_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3186,7 +3194,7 @@ impl Tenant {
        run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

        // Upload the created data dir to S3
-        if self.tenant_shard_id().is_zero() {
+        if self.tenant_shard_id().is_shard_zero() {
            self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
                .await?;
        }

@@ -3433,7 +3441,7 @@ impl Tenant {
            .store(size, Ordering::Relaxed);

        // Only shard zero should be calculating synthetic sizes
-        debug_assert!(self.shard_identity.is_zero());
+        debug_assert!(self.shard_identity.is_shard_zero());

        TENANT_SYNTHETIC_SIZE_METRIC
            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])

@@ -3505,7 +3513,7 @@ impl Tenant {
    }

    pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
-        self.tenant_conf.read().unwrap().tenant_conf.clone()
+        self.tenant_conf.load().tenant_conf.clone()
    }
 }

@@ -3653,6 +3661,9 @@ pub(crate) mod harness {
            heatmap_period: Some(tenant_conf.heatmap_period),
            lazy_slru_download: Some(tenant_conf.lazy_slru_download),
            timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: Some(
+                tenant_conf.image_layer_creation_check_threshold,
+            ),
        }
    }
 }

@@ -3841,6 +3852,8 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeMap;
+
    use super::*;
    use crate::keyspace::KeySpaceAccum;
    use crate::repository::{Key, Value};

@@ -3849,8 +3862,10 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
+    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
+    use tests::timeline::{GetVectoredError, ShutdownMode};

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

@@ -4296,7 +4311,7 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
        // so that all uploads finish & we can call harness.load() below again
        tenant
-            .shutdown(Default::default(), true)
+            .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
            .instrument(harness.span())
            .await
            .ok()

@@ -4337,7 +4352,7 @@ mod tests {

        // so that all uploads finish & we can call harness.load() below again
        tenant
-            .shutdown(Default::default(), true)
+            .shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
            .instrument(harness.span())
            .await
            .ok()
@@ -4647,6 +4662,62 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||||
|
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||||
|
|
||||||
|
let (tenant, ctx) = harness.load().await;
|
||||||
|
let tline = tenant
|
||||||
|
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||||
|
.await?;
|
||||||
|
let tline = tline.raw_timeline().unwrap();
|
||||||
|
|
||||||
|
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||||
|
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||||
|
modification.set_lsn(Lsn(0x1008))?;
|
||||||
|
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||||
|
modification.commit(&ctx).await?;
|
||||||
|
|
||||||
|
let child_timeline_id = TimelineId::generate();
|
||||||
|
tenant
|
||||||
|
.branch_timeline_test(
|
||||||
|
tline,
|
||||||
|
child_timeline_id,
|
||||||
|
Some(tline.get_last_record_lsn()),
|
||||||
|
&ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let child_timeline = tenant
|
||||||
|
.get_timeline(child_timeline_id, true)
|
||||||
|
.expect("Should have the branched timeline");
|
||||||
|
|
||||||
|
let aux_keyspace = KeySpace {
|
||||||
|
ranges: vec![NON_INHERITED_RANGE],
|
||||||
|
};
|
||||||
|
let read_lsn = child_timeline.get_last_record_lsn();
|
||||||
|
|
||||||
|
let vectored_res = child_timeline
|
||||||
|
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
child_timeline
|
||||||
|
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let images = vectored_res?;
|
||||||
|
let mut key = NON_INHERITED_RANGE.start;
|
||||||
|
while key < NON_INHERITED_RANGE.end {
|
||||||
|
assert!(matches!(
|
||||||
|
images[&key],
|
||||||
|
Err(PageReconstructError::MissingKey(_))
|
||||||
|
));
|
||||||
|
key = key.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
// Test that vectored get handles layer gaps correctly
|
// Test that vectored get handles layer gaps correctly
|
||||||
// by advancing into the next ancestor timeline if required.
|
// by advancing into the next ancestor timeline if required.
|
||||||
//
|
//
|
||||||
@@ -4786,6 +4857,166 @@ mod tests {
         Ok(())
     }
 
+    // Test that vectored get descends into ancestor timelines correctly and
+    // does not return an image that's newer than requested.
+    //
+    // The diagram below illustrates an interesting case. We have a parent timeline
+    // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
+    // from the child timeline, so the parent timeline must be visited. When advancing into
+    // the child timeline, the read path needs to remember what the requested Lsn was in
+    // order to avoid returning an image that's too new. The test below constructs such
+    // a timeline setup and does a few queries around the Lsn of each page image.
+    // ```
+    //    LSN
+    //     ^
+    //     |
+    //     |
+    // 500 | --------------------------------------> branch point
+    // 400 |        X
+    // 300 |        X
+    // 200 | --------------------------------------> requested lsn
+    // 100 |        X
+    //     |---------------------------------------> Key
+    //              |
+    //              ------> requested key
+    //
+    // Legend:
+    // * X - page images
+    // ```
+    #[tokio::test]
+    async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(1000);
+        let child_gap_at_key = start_key.add(500);
+        let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
+
+        let mut current_lsn = Lsn(0x10);
+
+        let timeline_id = TimelineId::generate();
+        let parent_timeline = tenant
+            .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        current_lsn += 0x100;
+
+        for _ in 0..3 {
+            let mut key = start_key;
+            while key < end_key {
+                current_lsn += 0x10;
+
+                let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
+
+                let mut writer = parent_timeline.writer().await;
+                writer
+                    .put(
+                        key,
+                        current_lsn,
+                        &Value::Image(test_img(&image_value)),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(current_lsn);
+
+                if key == child_gap_at_key {
+                    parent_gap_lsns.insert(current_lsn, image_value);
+                }
+
+                key = key.next();
+            }
+
+            parent_timeline.freeze_and_flush().await?;
+        }
+
+        let child_timeline_id = TimelineId::generate();
+
+        let child_timeline = tenant
+            .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
+            .await?;
+
+        let mut key = start_key;
+        while key < end_key {
+            if key == child_gap_at_key {
+                key = key.next();
+                continue;
+            }
+
+            current_lsn += 0x10;
+
+            let mut writer = child_timeline.writer().await;
+            writer
+                .put(
+                    key,
+                    current_lsn,
+                    &Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(current_lsn);
+
+            key = key.next();
+        }
+
+        child_timeline.freeze_and_flush().await?;
+
+        let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
+        let mut query_lsns = Vec::new();
+        for image_lsn in parent_gap_lsns.keys().rev() {
+            for offset in lsn_offsets {
+                query_lsns.push(Lsn(image_lsn
+                    .0
+                    .checked_add_signed(offset)
+                    .expect("Shouldn't overflow")));
+            }
+        }
+
+        for query_lsn in query_lsns {
+            let results = child_timeline
+                .get_vectored_impl(
+                    KeySpace {
+                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                    },
+                    query_lsn,
+                    &ctx,
+                )
+                .await;
+
+            let expected_item = parent_gap_lsns
+                .iter()
+                .rev()
+                .find(|(lsn, _)| **lsn <= query_lsn);
+
+            info!(
+                "Doing vectored read at LSN {}. Expecting image to be: {:?}",
+                query_lsn, expected_item
+            );
+
+            match expected_item {
+                Some((_, img_value)) => {
+                    let key_results = results.expect("No vectored get error expected");
+                    let key_result = &key_results[&child_gap_at_key];
+                    let returned_img = key_result
+                        .as_ref()
+                        .expect("No page reconstruct error expected");
+
+                    info!(
+                        "Vectored read at LSN {} returned image {}",
+                        query_lsn,
+                        std::str::from_utf8(returned_img)?
+                    );
+                    assert_eq!(*returned_img, test_img(img_value));
+                }
+                None => {
+                    assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_random_updates() -> anyhow::Result<()> {
         let harness = TenantHarness::create("test_random_updates")?;
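The invariant this test pins down is small enough to state on its own: when the read path descends into the ancestor, it must serve the newest image that is not newer than the requested LSN, or report a missing key. A minimal, self-contained sketch of that expectation (plain `u64` standing in for `Lsn`, a `BTreeMap` for the test's `parent_gap_lsns`):

```
use std::collections::BTreeMap;

// Pick the newest image at or below the requested LSN. This mirrors the
// test's oracle: `parent_gap_lsns.iter().rev().find(|(lsn, _)| **lsn <= query_lsn)`.
fn expected_image(images: &BTreeMap<u64, String>, request_lsn: u64) -> Option<&String> {
    images.range(..=request_lsn).next_back().map(|(_, img)| img)
}

fn main() {
    let images = BTreeMap::from([
        (100, "img@100".to_string()),
        (300, "img@300".to_string()),
        (400, "img@400".to_string()),
    ]);

    // A request at LSN 200 must not see the newer images at 300 and 400.
    assert_eq!(expected_image(&images, 200).map(String::as_str), Some("img@100"));
    // Below the oldest image there is nothing: the `MissingKey` case above.
    assert_eq!(expected_image(&images, 50), None);
}
```
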
@@ -5118,7 +5349,7 @@ mod tests {
         // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
         let raw_tline = tline.raw_timeline().unwrap();
         raw_tline
-            .shutdown()
+            .shutdown(super::timeline::ShutdownMode::Hard)
             .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
             .await;
         std::mem::forget(tline);
@@ -57,6 +57,9 @@ pub mod defaults {
     // throughputs up to 1GiB/s per timeline.
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -362,6 +365,10 @@ pub struct TenantConf {
     pub lazy_slru_download: bool,
 
     pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expressed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_creation_check_threshold: Option<u8>,
 }
 
 impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
                 .timeline_get_throttle
                 .clone()
                 .unwrap_or(global_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: self
+                .image_layer_creation_check_threshold
+                .unwrap_or(global_conf.image_layer_creation_check_threshold),
         }
     }
 }
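The merge above is the entire per-tenant override mechanism: every optional field falls back to the global default via `Option::unwrap_or`. A minimal sketch with illustrative names (not the pageserver's real types):

```
// Illustrative names, not the pageserver's real types: each per-tenant
// override is an Option that falls back to the global default.
#[derive(Clone, Copy)]
struct GlobalConf {
    image_layer_creation_check_threshold: u8,
}

#[derive(Default)]
struct TenantOverrides {
    image_layer_creation_check_threshold: Option<u8>,
}

impl TenantOverrides {
    fn effective_check_threshold(&self, global: GlobalConf) -> u8 {
        // Option::unwrap_or is the whole mechanism: the tenant value wins,
        // otherwise the global default applies.
        self.image_layer_creation_check_threshold
            .unwrap_or(global.image_layer_creation_check_threshold)
    }
}

fn main() {
    let global = GlobalConf {
        image_layer_creation_check_threshold: 2,
    };
    assert_eq!(TenantOverrides::default().effective_check_threshold(global), 2);

    let tenant = TenantOverrides {
        image_layer_creation_check_threshold: Some(5),
    };
    assert_eq!(tenant.effective_check_threshold(global), 5);
}
```
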
@@ -548,6 +561,7 @@ impl Default for TenantConf {
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
         }
     }
 }
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
             timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
         }
     }
 }
@@ -14,7 +14,10 @@ use crate::{
     config::PageServerConf,
     context::RequestContext,
     task_mgr::{self, TaskKind},
-    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    tenant::{
+        mgr::{TenantSlot, TenantsMapRemoveResult},
+        timeline::ShutdownMode,
+    },
 };
 
 use super::{
@@ -433,6 +436,11 @@ impl DeleteTenantFlow {
         .await
     }
 
+    /// Check whether background deletion of this tenant is currently in progress
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
     async fn prepare(
         tenant: &Arc<Tenant>,
     ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
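The `is_in_progress` helper added above relies on an idiom worth spelling out: the deletion flow holds `delete_progress` for its whole duration, so a failed `try_lock` doubles as an "is it running?" probe. A sketch of the same idea with `std::sync::Mutex` (the tenant code uses a tokio mutex, but the shape is identical):

```
use std::sync::{Arc, Mutex};

fn main() {
    // A long-running operation holds this mutex for its whole duration,
    // so a failed try_lock doubles as an "is it running?" probe.
    let delete_progress = Arc::new(Mutex::new(()));

    assert!(delete_progress.try_lock().is_ok()); // nothing in progress

    let guard = delete_progress.lock().unwrap(); // "deletion" starts
    assert!(delete_progress.try_lock().is_err()); // observed as in progress

    drop(guard); // "deletion" finishes
    assert!(delete_progress.try_lock().is_ok()); // idle again
}
```
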
@@ -463,7 +471,7 @@ impl DeleteTenantFlow {
         // tenant.shutdown
         // Its also bad that we're holding tenants.read here.
         // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
+        if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
             return Err(DeleteTenantError::Other(anyhow::anyhow!(
                 "tenant shutdown is already in progress"
             )));
@@ -72,6 +72,10 @@ impl EphemeralFile {
         self.len
     }
 
+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
+    }
+
     pub(crate) async fn read_blk(
         &self,
         blknum: u32,
@@ -346,35 +346,6 @@
     }
 }
 
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub enum InMemoryLayerHandle {
-    Open {
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-    Frozen {
-        idx: usize,
-        lsn_floor: Lsn,
-        end_lsn: Lsn,
-    },
-}
-
-impl InMemoryLayerHandle {
-    pub fn get_lsn_floor(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
-            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
-        }
-    }
-
-    pub fn get_end_lsn(&self) -> Lsn {
-        match self {
-            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
-            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
-        }
-    }
-}
-
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -576,41 +547,18 @@ impl LayerMap {
         self.historic.iter()
     }
 
-    /// Get a handle for the first in memory layer that matches the provided predicate.
-    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
-    ///
-    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
-    /// the same exclusive region established by holding the layer manager lock.
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
     where
         Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
     {
         if let Some(open) = &self.open_layer {
             if pred(open) {
-                return Some(InMemoryLayerHandle::Open {
-                    lsn_floor: open.get_lsn_range().start,
-                    end_lsn: open.get_lsn_range().end,
-                });
+                return Some(open.clone());
             }
         }
 
-        let pos = self.frozen_layers.iter().rev().position(pred);
-        pos.map(|rev_idx| {
-            let idx = self.frozen_layers.len() - 1 - rev_idx;
-            InMemoryLayerHandle::Frozen {
-                idx,
-                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
-                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
-            }
-        })
-    }
-
-    /// Get the layer pointed to by the provided handle.
-    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
-        match handle {
-            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
-            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
-        }
+        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
     }
 
     ///
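The new `find_in_memory_layer` reduces to: check the open layer first, then scan the frozen layers newest-first, and hand back a ref-counted clone instead of an index-based handle. A sketch under simplified types (`Layer` stands in for `InMemoryLayer`):

```
use std::sync::Arc;

// `Layer` stands in for `InMemoryLayer`; only the LSN floor matters here.
struct Layer {
    start_lsn: u64,
}

fn find_layer(
    open: Option<&Arc<Layer>>,
    frozen: &[Arc<Layer>], // ordered oldest to newest
    mut pred: impl FnMut(&Arc<Layer>) -> bool,
) -> Option<Arc<Layer>> {
    // The open layer is the newest; check it first.
    if let Some(open) = open {
        if pred(open) {
            return Some(Arc::clone(open));
        }
    }
    // rfind scans newest-first, replacing the old reverse-position dance.
    frozen.iter().rfind(|l| pred(l)).cloned()
}

fn main() {
    let frozen = vec![
        Arc::new(Layer { start_lsn: 10 }),
        Arc::new(Layer { start_lsn: 20 }),
    ];
    let hit = find_layer(None, &frozen, |l| l.start_lsn <= 20).unwrap();
    assert_eq!(hit.start_lsn, 20); // the newest match wins
}
```
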
@@ -235,6 +235,12 @@ impl TimelineMetadata {
         let bytes = instance.to_bytes().unwrap();
         Self::from_bytes(&bytes).unwrap()
     }
+
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
+    }
 }
 
 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
     }
 }
 
+/// Parts of the metadata which are regularly modified.
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
+
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
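A sketch of the `MetadataUpdate`/`apply` split introduced here, with `u64` standing in for `Lsn` and a deliberately reduced metadata body: only the regularly-changing fields travel in the update, everything else stays untouched.

```
// `u64` stands in for `Lsn`; the metadata body is deliberately reduced.
struct Metadata {
    disk_consistent_lsn: u64,
    prev_record_lsn: Option<u64>,
    latest_gc_cutoff_lsn: u64,
    pg_version: u32, // one of the many fields an update never touches
}

struct Update {
    disk_consistent_lsn: u64,
    prev_record_lsn: Option<u64>,
    latest_gc_cutoff_lsn: u64,
}

impl Metadata {
    fn apply(&mut self, update: &Update) {
        self.disk_consistent_lsn = update.disk_consistent_lsn;
        self.prev_record_lsn = update.prev_record_lsn;
        self.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
        // pg_version and friends deliberately stay as they are.
    }
}

fn main() {
    let mut meta = Metadata {
        disk_consistent_lsn: 0x10,
        prev_record_lsn: None,
        latest_gc_cutoff_lsn: 0x10,
        pg_version: 15,
    };
    meta.apply(&Update {
        disk_consistent_lsn: 0x40,
        prev_record_lsn: Some(0x38),
        latest_gc_cutoff_lsn: 0x20,
    });
    assert_eq!(meta.disk_consistent_lsn, 0x40);
    assert_eq!(meta.prev_record_lsn, Some(0x38));
    assert_eq!(meta.pg_version, 15); // unrelated state survives the update
}
```
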
@@ -44,6 +44,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
+use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
 
@@ -677,12 +678,19 @@ pub async fn init_tenant_mgr(
                     }
                 }
             }
-            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
-                tenant_shard_id,
-                shard_identity,
-                location_conf.tenant_conf,
-                &secondary_conf,
-            )),
+            LocationMode::Secondary(secondary_conf) => {
+                info!(
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug(),
+                    "Starting secondary tenant"
+                );
+                TenantSlot::Secondary(SecondaryTenant::new(
+                    tenant_shard_id,
+                    shard_identity,
+                    location_conf.tenant_conf,
+                    &secondary_conf,
+                ))
+            }
         };
 
         tenants.insert(tenant_shard_id, slot);
@@ -783,11 +791,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                     shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone()));
                     join_set.spawn(
                         async move {
-                            let freeze_and_flush = true;
-
                             let res = {
                                 let (_guard, shutdown_progress) = completion::channel();
-                                t.shutdown(shutdown_progress, freeze_and_flush).await
+                                t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await
                             };
 
                             if let Err(other_progress) = res {
@@ -1107,7 +1113,7 @@ impl TenantManager {
         };
 
         info!("Shutting down attached tenant");
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {}
             Err(barrier) => {
                 info!("Shutdown already in progress, waiting for it to complete");
@@ -1223,7 +1229,7 @@ impl TenantManager {
             TenantSlot::Attached(tenant) => {
                 let (_guard, progress) = utils::completion::channel();
                 info!("Shutting down just-spawned tenant, because tenant manager is shut down");
-                match tenant.shutdown(progress, false).await {
+                match tenant.shutdown(progress, ShutdownMode::Hard).await {
                     Ok(()) => {
                         info!("Finished shutting down just-spawned tenant");
                     }
@@ -1273,7 +1279,7 @@ impl TenantManager {
         };
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
+        match tenant.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {
                 slot_guard.drop_old_value()?;
             }
@@ -1411,9 +1417,15 @@ impl TenantManager {
 
         match tenant.current_state() {
             TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(());
+                }
             }
             _ => {
                 tenant
@@ -1649,7 +1661,14 @@ impl TenantManager {
             fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
                 "failpoint"
             )));
-            if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    *target_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::Tenant,
+                    ctx,
+                )
+                .await
+            {
                 // Failure here might mean shutdown, in any case this part is an optimization
                 // and we shouldn't hold up the split operation.
                 tracing::warn!(
@@ -1670,7 +1689,7 @@ impl TenantManager {
 
         // Phase 5: Shut down the parent shard, and erase it from disk
         let (_guard, progress) = completion::channel();
-        match parent.shutdown(progress, false).await {
+        match parent.shutdown(progress, ShutdownMode::Hard).await {
             Ok(()) => {}
             Err(other) => {
                 other.wait().await;
@@ -2657,11 +2676,11 @@
     let attached_tenant = match slot_guard.get_old_value() {
         Some(TenantSlot::Attached(tenant)) => {
             // whenever we remove a tenant from memory, we don't want to flush and wait for upload
-            let freeze_and_flush = false;
+            let shutdown_mode = ShutdownMode::Hard;
 
             // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
             // that we can continue safely to cleanup.
-            match tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, shutdown_mode).await {
                 Ok(()) => {}
                 Err(_other) => {
                     // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -200,14 +200,17 @@ use utils::backoff::{
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
+use std::time::Duration;
 
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
+use remote_storage::{
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
+};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
-use crate::deletion_queue::DeletionQueueClient;
+use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
     MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
@@ -235,6 +238,7 @@ use utils::id::{TenantId, TimelineId};
 
 use self::index::IndexPart;
 
+use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
@@ -261,6 +265,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
+/// Non-essential flushes of the deletion queue are subject to this timeout, after
+/// which we warn and skip.
+const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
+
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
     Deleted(IndexPart),
@@ -531,9 +539,10 @@ impl RemoteTimelineClient {
     // Upload operations.
     //
 
-    ///
     /// Launch an index-file upload operation in the background, with
-    /// updated metadata.
+    /// fully updated metadata.
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
     ///
     /// The upload will be added to the queue immediately, but it
     /// won't be performed until all previously scheduled layer file
@@ -545,7 +554,7 @@ impl RemoteTimelineClient {
     /// If there were any changes to the list of files, i.e. if any
     /// layer file uploads were scheduled, since the last index file
     /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
         self: &Arc<Self>,
         metadata: &TimelineMetadata,
     ) -> anyhow::Result<()> {
@@ -561,6 +570,27 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.latest_metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+
+        Ok(())
+    }
+
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
@@ -588,14 +618,14 @@ impl RemoteTimelineClient {
         upload_queue: &mut UploadQueueInitialized,
         metadata: TimelineMetadata,
     ) {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+
         info!(
-            "scheduling metadata upload with {} files ({} changed)",
+            "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
             upload_queue.latest_files.len(),
             upload_queue.latest_files_changes_since_metadata_upload_scheduled,
         );
 
-        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-
         let index_part = IndexPart::new(
             upload_queue.latest_files.clone(),
             disk_consistent_lsn,
@@ -1050,6 +1080,26 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
+        match tokio::time::timeout(
+            DELETION_QUEUE_FLUSH_TIMEOUT,
+            self.deletion_queue_client.flush_immediate(),
+        )
+        .await
+        {
+            Ok(result) => result,
+            Err(_timeout) => {
+                // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and
+                // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion
+                // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here.
+                tracing::warn!(
+                    "Timed out waiting for deletion queue flush, acking deletion anyway"
+                );
+                Ok(())
+            }
+        }
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
     /// deletes leaked files if any and proceeds with deletion of index file at the end.
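`flush_deletion_queue` is an instance of a general best-effort pattern: bound a non-essential wait with a timeout and degrade to a warning instead of an error. A runnable sketch in plain tokio (the flush body and the budget below are stand-ins):

```
use std::time::Duration;

// Bound a non-essential wait and degrade to a warning on timeout.
async fn flush_best_effort() -> Result<(), std::io::Error> {
    let flush = async {
        // stand-in for deletion_queue_client.flush_immediate()
        tokio::time::sleep(Duration::from_millis(10)).await;
        Ok(())
    };
    match tokio::time::timeout(Duration::from_secs(10), flush).await {
        Ok(result) => result,
        Err(_elapsed) => {
            eprintln!("timed out waiting for flush, continuing anyway");
            Ok(()) // swallow the timeout: the flush was only an optimization
        }
    }
}

#[tokio::main]
async fn main() {
    flush_best_effort().await.unwrap();
}
```
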
@@ -1097,23 +1147,29 @@ impl RemoteTimelineClient {
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
 
-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
         // taking the burden of listing all the layers that we already know we should delete.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         let cancel = shutdown_token();
 
         let remaining = download_retry(
             || async {
                 self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None, &cancel)
+                    .list(
+                        Some(&timeline_storage_path),
+                        ListingMode::NoDelimiter,
+                        None,
+                        &cancel,
+                    )
                     .await
             },
             "list remaining files",
             &cancel,
         )
         .await
-        .context("list files remaining files")?;
+        .context("list files remaining files")?
+        .keys;
 
         // We will delete the current index_part object last, since it acts as a deletion
         // marker via its deleted_at attribute
@@ -1173,7 +1229,7 @@ impl RemoteTimelineClient {
 
         // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
         // for a flush to a persistent deletion list so that we may be sure deletion will occur.
-        self.deletion_queue_client.flush_immediate().await?;
+        self.flush_deletion_queue().await?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
@@ -1569,7 +1625,7 @@ impl RemoteTimelineClient {
     /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
     ///
     /// In-progress operations will still be running after this function returns.
-    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))`
     /// to wait for them to complete, after calling this function.
     pub(crate) fn stop(&self) {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
@@ -1999,7 +2055,7 @@ mod tests {
         // Schedule upload of index. Check that it is queued
         let metadata = dummy_metadata(Lsn(0x20));
         client
-            .schedule_index_upload_for_metadata_update(&metadata)
+            .schedule_index_upload_for_full_metadata_update(&metadata)
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
     tenant_shard_id: TenantShardId,
     cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    let remote_path = remote_timelines_path(&tenant_shard_id);
+    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
 
     fail::fail_point!("storage-sync-list-remote-timelines", |_| {
         anyhow::bail!("storage-sync-list-remote-timelines");
@@ -417,11 +417,16 @@ pub(super) async fn download_index_part(
     let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 
     let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
+        || async {
+            storage
+                .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
+                .await
+        },
         "list index_part files",
         cancel,
     )
-    .await?;
+    .await?
+    .keys;
 
     // General case logic for which index to use: the latest index whose generation
     // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
     let warn_after = 3;
     let max_attempts = 10;
     let mut prefixes = Vec::with_capacity(2);
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
         // Also recover the unsharded prefix for a shard of zero:
         // - if the tenant is totally unsharded, the unsharded prefix contains all the data
         // - if the tenant is sharded, we still want to recover the initdb data, but we only
@@ -51,7 +51,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
     backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
-    id::TimelineId,
+    id::TimelineId, serde_system_time,
 };
 
 use super::{
@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             (detail.last_download, detail.next_download.unwrap())
         };
 
-        if now < next_download {
+        if now > next_download {
             Some(PendingDownload {
                 secondary_state: secondary_tenant,
                 last_download,
@@ -591,7 +591,7 @@ impl<'a> TenantDownloader<'a> {
         let mut progress = SecondaryProgress {
             layers_total: heatmap_stats.layers,
             bytes_total: heatmap_stats.bytes,
-            heatmap_mtime: Some(heatmap_mtime),
+            heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)),
             layers_downloaded: 0,
             bytes_downloaded: 0,
         };
@@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> {
                 progress.bytes_downloaded += layer_byte_count;
                 progress.layers_downloaded += layer_count;
             }
+
+            for delete_timeline in &delete_timelines {
+                // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
+                // from disk fails that will be a fatal error.
+                detail.timelines.remove(delete_timeline);
+            }
         }
 
         // Execute accumulated deletions
@@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> {
             .await
             .map_err(UpdateError::from)?;
 
+        SECONDARY_MODE.download_heatmap.inc();
+
         if Some(&download.etag) == prev_etag {
             Ok(HeatMapDownload::Unmodified)
         } else {
             let mut heatmap_bytes = Vec::new();
             let mut body = tokio_util::io::StreamReader::new(download.download_stream);
             let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
-            SECONDARY_MODE.download_heatmap.inc();
             Ok(HeatMapDownload::Modified(HeatMapModified {
                 etag: download.etag,
                 last_modified: download.last_modified,
@@ -786,6 +793,35 @@ impl<'a> TenantDownloader<'a> {
             // Existing on-disk layers: just update their access time.
             if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                 tracing::debug!("Layer {} is already on disk", layer.name);
+
+                if cfg!(debug_assertions) {
+                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
+                    // are already present on disk are really there.
+                    let local_path = self
+                        .conf
+                        .timeline_path(tenant_shard_id, &timeline.timeline_id)
+                        .join(layer.name.file_name());
+                    match tokio::fs::metadata(&local_path).await {
+                        Ok(meta) => {
+                            tracing::debug!(
+                                "Layer {} present at {}, size {}",
+                                layer.name,
+                                local_path,
+                                meta.len(),
+                            );
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "Layer {} not found at {} ({})",
+                                layer.name,
+                                local_path,
+                                e
+                            );
+                            debug_assert!(false);
+                        }
+                    }
+                }
+
                 if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                     || on_disk.access_time != layer.access_time
                 {
@@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
-use super::layer_map::InMemoryLayerHandle;
-use super::timeline::layer_manager::LayerManager;
+use self::inmemory_layer::InMemoryLayerFileId;
 use super::timeline::GetVectoredError;
 use super::PageReconstructError;
 
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
 
     keys_done: KeySpaceRandomAccum,
+    layers_visited: u32,
 }
 
 impl ValuesReconstructState {
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
         Self {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
+            layers_visited: 0,
         }
     }
 
@@ -138,6 +140,14 @@ impl ValuesReconstructState {
         }
     }
 
+    pub(crate) fn on_layer_visited(&mut self) {
+        self.layers_visited += 1;
+    }
+
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
+    }
+
     /// Update the state collected for a given key.
     /// Returns true if this was the last value needed for the key and false otherwise.
     ///
@@ -204,23 +214,30 @@ impl Default for ValuesReconstructState {
     }
 }
 
-/// Description of layer to be read - the layer map can turn
-/// this description into the actual layer.
-#[derive(PartialEq, Eq, Hash, Debug, Clone)]
-pub(crate) enum ReadableLayerDesc {
-    Persistent {
-        desc: PersistentLayerDesc,
-        lsn_range: Range<Lsn>,
-    },
-    InMemory {
-        handle: InMemoryLayerHandle,
-        lsn_ceil: Lsn,
-    },
+/// A key that uniquely identifies a layer in a timeline
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub(crate) enum LayerId {
+    PersitentLayerId(PersistentLayerKey),
+    InMemoryLayerId(InMemoryLayerFileId),
 }
 
-/// Wraper for 'ReadableLayerDesc' sorted by Lsn
+/// Layer wrapper for the read path. Note that it is valid
+/// to use these layers even after external operations have
+/// been performed on them (compaction, freeze, etc.).
 #[derive(Debug)]
-struct ReadableLayerDescOrdered(ReadableLayerDesc);
+pub(crate) enum ReadableLayer {
+    PersistentLayer(Layer),
+    InMemoryLayer(Arc<InMemoryLayer>),
+}
+
+/// A partial description of a read to be done.
+#[derive(Debug, Clone)]
+struct ReadDesc {
+    /// An id used to resolve the readable layer within the fringe
+    layer_id: LayerId,
+    /// Lsn range for the read, used for selecting the next read
+    lsn_range: Range<Lsn>,
+}
 
 /// Data structure which maintains a fringe of layers for the
 /// read path. The fringe is the set of layers which intersects
@@ -231,41 +248,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc);
 /// a two layer indexing scheme.
 #[derive(Debug)]
 pub(crate) struct LayerFringe {
-    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
-    layers: HashMap<ReadableLayerDesc, KeySpace>,
+    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
+    layers: HashMap<LayerId, LayerKeyspace>,
+}
+
+#[derive(Debug)]
+struct LayerKeyspace {
+    layer: ReadableLayer,
+    target_keyspace: KeySpace,
 }
 
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            layers_by_lsn: BinaryHeap::new(),
+            planned_reads_by_lsn: BinaryHeap::new(),
             layers: HashMap::new(),
         }
     }
 
-    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
-        let handle = match self.layers_by_lsn.pop() {
-            Some(h) => h,
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
+        let read_desc = match self.planned_reads_by_lsn.pop() {
+            Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&handle.0);
+        let removed = self.layers.remove_entry(&read_desc.layer_id);
         match removed {
-            Some((layer, keyspace)) => Some((layer, keyspace)),
+            Some((
+                _,
+                LayerKeyspace {
+                    layer,
+                    target_keyspace,
+                },
+            )) => Some((layer, target_keyspace, read_desc.lsn_range)),
             None => unreachable!("fringe internals are always consistent"),
         }
     }
 
-    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
-        let entry = self.layers.entry(layer.clone());
+    pub(crate) fn update(
+        &mut self,
+        layer: ReadableLayer,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+    ) {
+        let layer_id = layer.id();
+        let entry = self.layers.entry(layer_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
-                entry.get_mut().merge(&keyspace);
+                entry.get_mut().target_keyspace.merge(&keyspace);
             }
             Entry::Vacant(entry) => {
-                self.layers_by_lsn
-                    .push(ReadableLayerDescOrdered(entry.key().clone()));
-                entry.insert(keyspace);
+                self.planned_reads_by_lsn.push(ReadDesc {
+                    lsn_range,
+                    layer_id: layer_id.clone(),
+                });
+                entry.insert(LayerKeyspace {
+                    layer,
+                    target_keyspace: keyspace,
+                });
             }
         }
     }
 }
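The fringe's two-level indexing can be shown in miniature: a `BinaryHeap` fixes the order in which reads are popped, while a `HashMap` merges keyspaces per layer, so updating the same layer twice merges rather than re-queues. A sketch with string ids and `Vec<u32>` keyspaces standing in for `LayerId` and `KeySpace`:

```
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};

// A planned read: ordered by end LSN so the heap pops newest-first.
#[derive(PartialEq, Eq)]
struct ReadDesc {
    end_lsn: u64,
    layer_id: String,
}

impl Ord for ReadDesc {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.end_lsn
            .cmp(&other.end_lsn)
            .then_with(|| self.layer_id.cmp(&other.layer_id))
    }
}

impl PartialOrd for ReadDesc {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

#[derive(Default)]
struct Fringe {
    by_lsn: BinaryHeap<ReadDesc>,
    keyspaces: HashMap<String, Vec<u32>>,
}

impl Fringe {
    fn update(&mut self, layer_id: &str, end_lsn: u64, mut keys: Vec<u32>) {
        match self.keyspaces.entry(layer_id.to_string()) {
            // Already planned: merge the keyspace, do not re-queue.
            Entry::Occupied(mut e) => e.get_mut().append(&mut keys),
            // New layer: queue a read descriptor and record its keyspace.
            Entry::Vacant(e) => {
                self.by_lsn.push(ReadDesc {
                    end_lsn,
                    layer_id: layer_id.to_string(),
                });
                e.insert(keys);
            }
        }
    }

    /// Pop the read with the highest end LSN, with its merged keyspace.
    fn next(&mut self) -> Option<(String, Vec<u32>)> {
        let desc = self.by_lsn.pop()?;
        let keys = self.keyspaces.remove(&desc.layer_id)?;
        Some((desc.layer_id, keys))
    }
}

fn main() {
    let mut fringe = Fringe::default();
    fringe.update("delta-A", 100, vec![1, 2]);
    fringe.update("delta-A", 100, vec![3]); // merged, not re-queued
    fringe.update("image-B", 200, vec![1]);

    assert_eq!(fringe.next(), Some(("image-B".to_string(), vec![1])));
    assert_eq!(fringe.next(), Some(("delta-A".to_string(), vec![1, 2, 3])));
    assert_eq!(fringe.next(), None);
}
```
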
@@ -277,77 +317,55 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadableLayerDescOrdered {
+impl Ord for ReadDesc {
     fn cmp(&self, other: &Self) -> Ordering {
-        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
+        let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
-            self.0
-                .get_lsn_floor()
-                .cmp(&other.0.get_lsn_floor())
-                .reverse()
+            self.lsn_range.start.cmp(&other.lsn_range.start).reverse()
         } else {
             ord
         }
     }
 }
 
-impl PartialOrd for ReadableLayerDescOrdered {
+impl PartialOrd for ReadDesc {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadableLayerDescOrdered {
+impl PartialEq for ReadDesc {
     fn eq(&self, other: &Self) -> bool {
-        self.0.get_lsn_floor() == other.0.get_lsn_floor()
-            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+        self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadableLayerDescOrdered {}
+impl Eq for ReadDesc {}
 
-impl ReadableLayerDesc {
-    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+impl ReadableLayer {
+    pub(crate) fn id(&self) -> LayerId {
         match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
-            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
-        }
-    }
-
-    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
-        match self {
-            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
-            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+            Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()),
+            Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()),
         }
     }
 
     pub(crate) async fn get_values_reconstruct_data(
         &self,
-        layer_manager: &LayerManager,
         keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
         match self {
-            ReadableLayerDesc::Persistent { desc, lsn_range } => {
-                let layer = layer_manager.get_from_desc(desc);
+            ReadableLayer::PersistentLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(
-                        keyspace,
-                        lsn_range.clone(),
-                        reconstruct_state,
-                        ctx,
-                    )
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
             }
-            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
-                let layer = layer_manager
-                    .layer_map()
-                    .get_in_memory_layer(handle)
-                    .unwrap();
-
+            ReadableLayer::InMemoryLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                     .await
             }
         }
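The `Ord` implementation above determines pop order for the whole vectored read: highest `lsn_range.end` first, and on a tie the read with the lower floor wins because the start comparison is reversed. A small demonstration with `u64` ranges:

```
use std::collections::BinaryHeap;
use std::ops::Range;

#[derive(Debug, PartialEq, Eq)]
struct Desc(Range<u64>);

impl Ord for Desc {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let ord = self.0.end.cmp(&other.0.end);
        if ord == std::cmp::Ordering::Equal {
            // Reversed: of two reads ending at the same LSN, the one that
            // reaches further down (lower floor) compares as greater.
            self.0.start.cmp(&other.0.start).reverse()
        } else {
            ord
        }
    }
}

impl PartialOrd for Desc {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Desc(40..100));
    heap.push(Desc(10..100)); // same end, lower floor
    heap.push(Desc(0..50));

    assert_eq!(heap.pop(), Some(Desc(10..100))); // lower floor wins the tie
    assert_eq!(heap.pop(), Some(Desc(40..100)));
    assert_eq!(heap.pop(), Some(Desc(0..50)));
}
```
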
@@ -20,8 +20,8 @@
 //! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
 //! ```
 //!
-//! Every delta file consists of three parts: "summary", "index", and
-//! "values". The summary is a fixed size header at the beginning of the file,
+//! Every delta file consists of three parts: "summary", "values", and
+//! "index". The summary is a fixed size header at the beginning of the file,
 //! and it contains basic information about the layer, and offsets to the other
 //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
 //! "values" part. The actual page images and WAL records are stored in the
@@ -47,6 +47,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
+use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
@@ -727,6 +728,9 @@ impl DeltaLayerInner {
             // production code path
             expected_summary.index_start_blk = actual_summary.index_start_blk;
             expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;
+
             if actual_summary != expected_summary {
                 bail!(
                     "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
@@ -862,7 +866,7 @@ impl DeltaLayerInner {
                 .into(),
         );
 
-        let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let data_end_offset = self.index_start_offset();
 
         let reads = Self::plan_reads(
             keyspace,
@@ -938,7 +942,7 @@ impl DeltaLayerInner {
             }
 
             if !range_end_handled {
-                tracing::info!("Handling range end fallback at {}", data_end_offset);
+                tracing::debug!("Handling range end fallback at {}", data_end_offset);
                 planner.handle_range_end(data_end_offset);
             }
         }
@@ -946,6 +950,34 @@ impl DeltaLayerInner {
|
|||||||
Ok(planner.finish())
|
Ok(planner.finish())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_min_read_buffer_size(
|
||||||
|
planned_reads: &[VectoredRead],
|
||||||
|
read_size_soft_max: usize,
|
||||||
|
) -> usize {
|
||||||
|
let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else {
|
||||||
|
return read_size_soft_max;
|
||||||
|
};
|
||||||
|
|
||||||
|
let largest_read_size = largest_read.size();
|
||||||
|
if largest_read_size > read_size_soft_max {
|
||||||
|
// If the read is oversized, it should only contain one key.
|
||||||
|
let offenders = largest_read
|
||||||
|
.blobs_at
|
||||||
|
.as_slice()
|
||||||
|
.iter()
|
||||||
|
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
|
||||||
|
.join(", ");
|
||||||
|
tracing::warn!(
|
||||||
|
"Oversized vectored read ({} > {}) for keys {}",
|
||||||
|
largest_read_size,
|
||||||
|
read_size_soft_max,
|
||||||
|
offenders
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
largest_read_size
|
||||||
|
}
|
||||||
|
|
||||||
async fn do_reads_and_update_state(
|
async fn do_reads_and_update_state(
|
||||||
&self,
|
&self,
|
||||||
reads: Vec<VectoredRead>,
|
reads: Vec<VectoredRead>,
|
||||||
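
The new `get_min_read_buffer_size` sizes the read buffer to the largest planned read and only warns when that exceeds the soft cap, which a single oversized value is allowed to do. A standalone rendering of the same policy on plain byte counts (hypothetical helper, not the pageserver API):

```rust
// Same sizing policy as the diff above: allocate exactly what the largest
// planned read needs, warn when it blows past the soft cap.
fn min_read_buffer_size(read_sizes: &[usize], soft_max: usize) -> usize {
    let Some(&largest) = read_sizes.iter().max() else {
        return soft_max; // no reads planned; fall back to the configured cap
    };
    if largest > soft_max {
        eprintln!("oversized read: {largest} > {soft_max}");
    }
    largest
}

fn main() {
    // reads within the cap: buffer sized to the largest read
    assert_eq!(min_read_buffer_size(&[100, 1024], 64 * 1024), 1024);
    // one oversized value forces a bigger (warned-about) buffer
    assert_eq!(min_read_buffer_size(&[100, 128 * 1024], 64 * 1024), 128 * 1024);
}
```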
@@ -959,7 +991,8 @@ impl DeltaLayerInner {
             .expect("Layer is loaded with max vectored bytes config")
             .0
             .into();
-        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
+        let mut buf = Some(BytesMut::with_capacity(buf_size));
 
         // Note that reads are processed in reverse order (from highest key+lsn).
         // This is the order that `ReconstructState` requires such that it can
@@ -986,7 +1019,7 @@ impl DeltaLayerInner {
 
                     // We have "lost" the buffer since the lower level IO api
                     // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+                    buf = Some(BytesMut::with_capacity(buf_size));
 
                     continue;
                 }
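
Both call sites now use the precomputed `buf_size`, and the `Option<BytesMut>` dance stays the same: the I/O call consumes the buffer and only hands it back on success, so the error path reallocates. A minimal sketch of that ownership pattern, assuming only the `bytes` crate and a stand-in `fake_read`:

```rust
use bytes::BytesMut;

// Sketch of the buffer-recycling pattern above: take the buffer out of the
// Option for each read, put it back on success, reallocate on failure.
fn fake_read(buf: BytesMut, fail: bool) -> Result<BytesMut, ()> {
    if fail { Err(()) } else { Ok(buf) }
}

fn main() {
    let buf_size = 8192;
    let mut buf = Some(BytesMut::with_capacity(buf_size));
    for fail in [false, true, false] {
        match fake_read(buf.take().expect("buffer is always restored"), fail) {
            Ok(returned) => buf = Some(returned), // reuse the same allocation
            Err(()) => buf = Some(BytesMut::with_capacity(buf_size)), // buffer lost
        }
    }
}
```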
@@ -1073,11 +1106,195 @@ impl DeltaLayerInner {
         if let Some(last) = all_keys.last_mut() {
             // Last key occupies all space till end of value storage,
             // which corresponds to beginning of the index
-            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_offset() - last.size;
         }
         Ok(all_keys)
     }
 
+    /// Using the given writer, write out a truncated version, where LSNs higher than the
+    /// truncate_at are missing.
+    #[cfg(test)]
+    pub(super) async fn copy_prefix(
+        &self,
+        writer: &mut DeltaLayerWriter,
+        truncate_at: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use crate::tenant::vectored_blob_io::{
+            BlobMeta, VectoredReadBuilder, VectoredReadExtended,
+        };
+        use futures::stream::TryStreamExt;
+
+        #[derive(Debug)]
+        enum Item {
+            Actual(Key, Lsn, BlobRef),
+            Sentinel,
+        }
+
+        impl From<Item> for Option<(Key, Lsn, BlobRef)> {
+            fn from(value: Item) -> Self {
+                match value {
+                    Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
+                    Item::Sentinel => None,
+                }
+            }
+        }
+
+        impl Item {
+            fn offset(&self) -> Option<BlobRef> {
+                match self {
+                    Item::Actual(_, _, blob) => Some(*blob),
+                    Item::Sentinel => None,
+                }
+            }
+
+            fn is_last(&self) -> bool {
+                matches!(self, Item::Sentinel)
+            }
+        }
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+
+        let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
+        let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
+        // put in a sentinel value for getting the end offset for last item, and not having to
+        // repeat the whole read part
+        let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
+            Item::Sentinel,
+        ))));
+        let mut stream = std::pin::pin!(stream);
+
+        let mut prev: Option<(Key, Lsn, BlobRef)> = None;
+
+        let mut read_builder: Option<VectoredReadBuilder> = None;
+
+        let max_read_size = self
+            .max_vectored_read_bytes
+            .map(|x| x.0.get())
+            .unwrap_or(8192);
+
+        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+
+        // FIXME: buffering of DeltaLayerWriter
+        let mut per_blob_copy = Vec::new();
+
+        while let Some(item) = stream.try_next().await? {
+            tracing::debug!(?item, "popped");
+            let offset = item
+                .offset()
+                .unwrap_or(BlobRef::new(self.index_start_offset(), false));
+
+            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
+                let end_offset = offset;
+
+                Some((BlobMeta { key, lsn }, start_offset..end_offset))
+            } else {
+                None
+            };
+
+            let is_last = item.is_last();
+
+            prev = Option::from(item);
+
+            let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
+
+            let builder = if let Some((meta, offsets)) = actionable {
+                // extend or create a new builder
+                if read_builder
+                    .as_mut()
+                    .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
+                    .unwrap_or(VectoredReadExtended::No)
+                    == VectoredReadExtended::Yes
+                {
+                    None
+                } else {
+                    read_builder.replace(VectoredReadBuilder::new(
+                        offsets.start.pos(),
+                        offsets.end.pos(),
+                        meta,
+                        max_read_size,
+                    ))
+                }
+            } else {
+                // nothing to do, except perhaps flush any existing for the last element
+                None
+            };
+
+            // flush the possible older builder and also the new one if the item was the last one
+            let builders = builder.into_iter();
+            let builders = if is_last {
+                builders.chain(read_builder.take())
+            } else {
+                builders.chain(None)
+            };
+
+            for builder in builders {
+                let read = builder.build();
+
+                let reader = VectoredBlobReader::new(&self.file);
+
+                let mut buf = buffer.take().unwrap();
+
+                buf.clear();
+                buf.reserve(read.size());
+                let res = reader.read_blobs(&read, buf).await?;
+
+                for blob in res.blobs {
+                    let key = blob.meta.key;
+                    let lsn = blob.meta.lsn;
+                    let data = &res.buf[blob.start..blob.end];
+
+                    #[cfg(debug_assertions)]
+                    Value::des(data)
+                        .with_context(|| {
+                            format!(
+                                "blob failed to deserialize for {}@{}, {}..{}: {:?}",
+                                blob.meta.key,
+                                blob.meta.lsn,
+                                blob.start,
+                                blob.end,
+                                utils::Hex(data)
+                            )
+                        })
+                        .unwrap();
+
+                    // is it an image or will_init walrecord?
+                    // FIXME: this could be handled by threading the BlobRef to the
+                    // VectoredReadBuilder
+                    let will_init = crate::repository::ValueBytes::will_init(data)
+                        .inspect_err(|_e| {
+                            #[cfg(feature = "testing")]
+                            tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
+                        })
+                        .unwrap_or(false);
+
+                    per_blob_copy.clear();
+                    per_blob_copy.extend_from_slice(data);
+
+                    let (tmp, res) = writer
+                        .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
+                        .await;
+                    per_blob_copy = tmp;
+                    res?;
+                }
+
+                buffer = Some(res.buf);
+            }
+        }
+
+        assert!(
+            read_builder.is_none(),
+            "with the sentinel above loop should had handled all"
+        );
+
+        Ok(())
+    }
+
     pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
         println!(
             "index_start_blk: {}, root {}",
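
`copy_prefix` chains one `Sentinel` onto the index stream so the loop runs one extra iteration: each entry's end offset is simply the next item's start offset, and the sentinel supplies the final end offset (the start of the index) without a special case. A minimal synchronous sketch of that idiom over plain integer offsets, not the pageserver's stream types:

```rust
// Sentinel idiom from `copy_prefix`, reduced to plain offsets: chain one extra
// item so every blob's end offset is just the next item's start offset.
fn blob_ranges(starts: &[u64], index_start: u64) -> Vec<std::ops::Range<u64>> {
    let items = starts
        .iter()
        .copied()
        .map(Some)
        .chain(std::iter::once(None)); // None plays the role of Item::Sentinel

    let mut prev: Option<u64> = None;
    let mut ranges = Vec::new();
    for item in items {
        // the sentinel's "offset" is where the index begins, i.e. end of values
        let offset = item.unwrap_or(index_start);
        if let Some(start) = prev.take() {
            ranges.push(start..offset);
        }
        prev = item;
    }
    ranges
}

fn main() {
    assert_eq!(blob_ranges(&[0, 10, 40], 100), vec![0..10, 10..40, 40..100]);
}
```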
@@ -1147,6 +1364,44 @@ impl DeltaLayerInner {
 
         Ok(())
     }
+
+    #[cfg(test)]
+    fn stream_index_forwards<'a, R>(
+        &'a self,
+        reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
+        start: &'a [u8; DELTA_KEY_SIZE],
+        ctx: &'a RequestContext,
+    ) -> impl futures::stream::Stream<
+        Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
+    > + 'a
+    where
+        R: BlockReader,
+    {
+        use futures::stream::TryStreamExt;
+        let stream = reader.get_stream_from(start, ctx);
+        stream.map_ok(|(key, value)| {
+            let key = DeltaKey::from_slice(&key);
+            let (key, lsn) = (key.key(), key.lsn());
+            let offset = BlobRef(value);
+
+            (key, lsn, offset)
+        })
+    }
+
+    /// The file offset to the first block of index.
+    ///
+    /// The file structure is summary, values, and index. We often need this for the size of last blob.
+    fn index_start_offset(&self) -> u64 {
+        let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let bref = BlobRef(offset);
+        tracing::debug!(
+            index_start_blk = self.index_start_blk,
+            offset,
+            pos = bref.pos(),
+            "index_start_offset"
+        );
+        offset
+    }
 }
 
 /// A set of data associated with a delta layer key and its value
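
The new `index_start_offset` helper centralizes the block-to-byte conversion that previously appeared inline at several call sites. A tiny worked example of the arithmetic, using a local `PAGE_SZ` constant (8192 in the pageserver):

```rust
// The conversion `index_start_offset` centralizes: block number to byte offset.
const PAGE_SZ: u64 = 8192;

fn index_start_offset(index_start_blk: u32) -> u64 {
    index_start_blk as u64 * PAGE_SZ
}

fn main() {
    // e.g. an index starting at block 3 begins at byte 24576; everything
    // before that offset is the summary page plus the "values" section.
    assert_eq!(index_start_offset(3), 24576);
}
```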
@@ -1210,9 +1465,16 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
 mod test {
     use std::collections::BTreeMap;
 
+    use itertools::MinMaxResult;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::RngCore;
+
     use super::*;
     use crate::{
-        context::DownloadBehavior, task_mgr::TaskKind, tenant::disk_btree::tests::TestDisk,
+        context::DownloadBehavior,
+        task_mgr::TaskKind,
+        tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
+        DEFAULT_PG_VERSION,
     };
 
     /// Construct an index for a fictional delta layer and and then
@@ -1332,4 +1594,442 @@ mod test {
 
         assert_eq!(planned_blobs, expected_blobs);
     }
+
+    mod constants {
+        use utils::lsn::Lsn;
+
+        /// Offset used by all lsns in this test
+        pub(super) const LSN_OFFSET: Lsn = Lsn(0x08);
+        /// Number of unique keys including in the test data
+        pub(super) const KEY_COUNT: u8 = 60;
+        /// Max number of different lsns for each key
+        pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20;
+        /// Possible value sizes for each key along with a probability weight
+        pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
+        /// Probability that there will be a gap between the current key and the next one (33.3%)
+        pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)];
+        /// The minimum size of a key range in all the generated reads
+        pub(super) const MIN_RANGE_SIZE: i128 = 10;
+        /// The number of ranges included in each vectored read
+        pub(super) const RANGES_COUNT: u8 = 2;
+        /// The number of vectored reads performed
+        pub(super) const READS_COUNT: u8 = 100;
+        /// Soft max size of a vectored read. Will be violated if we have to read keys
+        /// with values larger than the limit
+        pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024;
+    }
+
+    struct Entry {
+        key: Key,
+        lsn: Lsn,
+        value: Vec<u8>,
+    }
+
+    fn generate_entries(rng: &mut StdRng) -> Vec<Entry> {
+        let mut current_key = Key::MIN;
+
+        let mut entries = Vec::new();
+        for _ in 0..constants::KEY_COUNT {
+            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
+            let mut lsns_iter =
+                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
+                    Some(Lsn(lsn.0 + 0x08))
+                });
+            let mut lsns = Vec::new();
+            while lsns.len() < count as usize {
+                let take = rng.gen_bool(0.5);
+                let lsn = lsns_iter.next().unwrap();
+                if take {
+                    lsns.push(lsn);
+                }
+            }
+
+            for lsn in lsns {
+                let size = constants::VALUE_SIZES
+                    .choose_weighted(rng, |item| item.1)
+                    .unwrap()
+                    .0;
+                let mut buf = vec![0; size];
+                rng.fill_bytes(&mut buf);
+
+                entries.push(Entry {
+                    key: current_key,
+                    lsn,
+                    value: buf,
+                })
+            }
+
+            let gap = constants::KEY_GAP_CHANGES
+                .choose_weighted(rng, |item| item.1)
+                .unwrap()
+                .0;
+            if gap {
+                current_key = current_key.add(2);
+            } else {
+                current_key = current_key.add(1);
+            }
+        }
+
+        entries
+    }
+
+    struct EntriesMeta {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        index: BTreeMap<(Key, Lsn), Vec<u8>>,
+    }
+
+    fn get_entries_meta(entries: &[Entry]) -> EntriesMeta {
+        let key_range = match entries.iter().minmax_by_key(|e| e.key) {
+            MinMaxResult::MinMax(min, max) => min.key..max.key.next(),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) {
+            MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1),
+            _ => panic!("More than one entry is always expected"),
+        };
+
+        let mut index = BTreeMap::new();
+        for entry in entries.iter() {
+            index.insert((entry.key, entry.lsn), entry.value.clone());
+        }
+
+        EntriesMeta {
+            key_range,
+            lsn_range,
+            index,
+        }
+    }
+
+    fn pick_random_keyspace(rng: &mut StdRng, key_range: &Range<Key>) -> KeySpace {
+        let start = key_range.start.to_i128();
+        let end = key_range.end.to_i128();
+
+        let mut keyspace = KeySpace::default();
+
+        for _ in 0..constants::RANGES_COUNT {
+            let mut range: Option<Range<Key>> = Option::default();
+            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
+                let range_start = rng.gen_range(start..end);
+                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
+                if range_end_offset >= end {
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
+                } else {
+                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
+                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
+                }
+            }
+            keyspace.ranges.push(range.unwrap());
+        }
+
+        keyspace
+    }
+
+    #[tokio::test]
+    async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        tracing::info!("Generating test data ...");
+
+        let rng = &mut StdRng::seed_from_u64(0);
+        let entries = generate_entries(rng);
+        let entries_meta = get_entries_meta(&entries);
+
+        tracing::info!("Done generating {} entries", entries.len());
+
+        tracing::info!("Writing test data to delta layer ...");
+        let mut writer = DeltaLayerWriter::new(
+            harness.conf,
+            timeline_id,
+            harness.tenant_shard_id,
+            entries_meta.key_range.start,
+            entries_meta.lsn_range.clone(),
+        )
+        .await?;
+
+        for entry in entries {
+            let (_, res) = writer
+                .put_value_bytes(entry.key, entry.lsn, entry.value, false)
+                .await;
+            res?;
+        }
+
+        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
+
+        let inner = resident.as_delta(&ctx).await?;
+
+        let file_size = inner.file.metadata().await?.len();
+        tracing::info!(
+            "Done writing test data to delta layer. Resulting file size is: {}",
+            file_size
+        );
+
+        for i in 0..constants::READS_COUNT {
+            tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT);
+
+            let block_reader = FileBlockReader::new(&inner.file, inner.file_id);
+            let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                block_reader,
+            );
+
+            let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES);
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let keyspace = pick_random_keyspace(rng, &entries_meta.key_range);
+            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;
+
+            let vectored_reads = DeltaLayerInner::plan_reads(
+                keyspace.clone(),
+                entries_meta.lsn_range.clone(),
+                data_end_offset,
+                index_reader,
+                planner,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+            let vectored_blob_reader = VectoredBlobReader::new(&inner.file);
+            let buf_size = DeltaLayerInner::get_min_read_buffer_size(
+                &vectored_reads,
+                constants::MAX_VECTORED_READ_BYTES,
+            );
+            let mut buf = Some(BytesMut::with_capacity(buf_size));
+
+            for read in vectored_reads {
+                let blobs_buf = vectored_blob_reader
+                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                    .await?;
+                for meta in blobs_buf.blobs.iter() {
+                    let value = &blobs_buf.buf[meta.start..meta.end];
+                    assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]);
+                }
+
+                buf = Some(blobs_buf.buf);
+            }
+        }
+
+        Ok(())
+    }
+
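
The test data generator above draws value sizes with `rand`'s weighted sampling, so with weights 2:2:1 roughly one value in five is a megabyte, which is what forces the oversized-read path under test. A standalone sketch of that sampling with the same weights (assuming only the `rand` crate):

```rust
use rand::prelude::{SeedableRng, SliceRandom, StdRng};

// Standalone sketch of the weighted size sampling used by `generate_entries`.
fn main() {
    let sizes: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)];
    let mut rng = StdRng::seed_from_u64(0);

    let mut big = 0usize;
    for _ in 0..1000 {
        let (size, _weight) = sizes.choose_weighted(&mut rng, |item| item.1).unwrap();
        if *size == 1024 * 1024 {
            big += 1;
        }
    }
    // weights 2:2:1 make the 1 MiB size come up ~20% of the time
    println!("1 MiB values drawn: {big}/1000");
}
```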
+    #[tokio::test]
+    async fn copy_delta_prefix_smoke() {
+        use crate::walrecord::NeonWalRecord;
+        use bytes::Bytes;
+
+        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
+        let (tenant, ctx) = h.load().await;
+        let ctx = &ctx;
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
+            .await
+            .unwrap();
+
+        let initdb_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .next()
+            .unwrap();
+
+        {
+            let mut writer = timeline.writer().await;
+
+            let data = [
+                (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
+                (
+                    0x30,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: false,
+                        rec: Bytes::from_static(b"1"),
+                    }),
+                ),
+                (
+                    0x40,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"2"),
+                    }),
+                ),
+                // build an oversized value so we cannot extend and existing read over
+                // this
+                (
+                    0x50,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: {
+                            let mut buf =
+                                vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
+                            buf.iter_mut()
+                                .enumerate()
+                                .for_each(|(i, slot)| *slot = (i % 256) as u8);
+                            Bytes::from(buf)
+                        },
+                    }),
+                ),
+                // because the oversized read cannot be extended further, we are sure to exercise the
+                // builder created on the last round with this:
+                (
+                    0x60,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"3"),
+                    }),
+                ),
+                (
+                    0x60,
+                    9,
+                    Value::Image(Bytes::from_static(b"something for a different key")),
+                ),
+            ];
+
+            let mut last_lsn = None;
+
+            for (lsn, key, value) in data {
+                let key = Key::from_i128(key);
+                writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
+                last_lsn = Some(lsn);
+            }
+
+            writer.finish_write(Lsn(last_lsn.unwrap()));
+        }
+        timeline.freeze_and_flush().await.unwrap();
+
+        let new_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .find(|x| x != &initdb_layer)
+            .unwrap();
+
+        // create a copy for the timeline, so we don't overwrite the file
+        let branch = tenant
+            .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
+
+        // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
+        // a single key
+
+        for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
+            let truncate_at = Lsn(truncate_at);
+
+            let mut writer = DeltaLayerWriter::new(
+                tenant.conf,
+                branch.timeline_id,
+                tenant.tenant_shard_id,
+                Key::MIN,
+                Lsn(0x11)..truncate_at,
+            )
+            .await
+            .unwrap();
+
+            let new_layer = new_layer.download_and_keep_resident().await.unwrap();
+
+            new_layer
+                .copy_delta_prefix(&mut writer, truncate_at, ctx)
+                .await
+                .unwrap();
+
+            let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
+
+            copied_layer.as_delta(ctx).await.unwrap();
+
+            assert_keys_and_values_eq(
+                new_layer.as_delta(ctx).await.unwrap(),
+                copied_layer.as_delta(ctx).await.unwrap(),
+                truncate_at,
+                ctx,
+            )
+            .await;
+        }
+    }
+
+    async fn assert_keys_and_values_eq(
+        source: &DeltaLayerInner,
+        truncated: &DeltaLayerInner,
+        truncated_at: Lsn,
+        ctx: &RequestContext,
+    ) {
+        use futures::future::ready;
+        use futures::stream::TryStreamExt;
+
+        let start_key = [0u8; DELTA_KEY_SIZE];
+
+        let source_reader = FileBlockReader::new(&source.file, source.file_id);
+        let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            source.index_start_blk,
+            source.index_root_blk,
+            &source_reader,
+        );
+        let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
+        let source_stream = source_stream.filter(|res| match res {
+            Ok((_, lsn, _)) => ready(lsn < &truncated_at),
+            _ => ready(true),
+        });
+        let mut source_stream = std::pin::pin!(source_stream);
+
+        let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
+        let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            truncated.index_start_blk,
+            truncated.index_root_blk,
+            &truncated_reader,
+        );
+        let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
+        let mut truncated_stream = std::pin::pin!(truncated_stream);
+
+        let mut scratch_left = Vec::new();
+        let mut scratch_right = Vec::new();
+
+        loop {
+            let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
+            let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
+
+            if src.is_none() {
+                assert!(truncated.is_none());
+                break;
+            }
+
+            let (src, truncated) = (src.unwrap(), truncated.unwrap());
+
+            // because we've filtered the source with Lsn, we should always have the same keys from both.
+            assert_eq!(src.0, truncated.0);
+            assert_eq!(src.1, truncated.1);
+
+            // if this is needed for something else, just drop this assert.
+            assert!(
+                src.2.pos() >= truncated.2.pos(),
+                "value position should not go backwards {} vs. {}",
+                src.2.pos(),
+                truncated.2.pos()
+            );
+
+            scratch_left.clear();
+            let src_cursor = source_reader.block_cursor();
+            let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
+            scratch_right.clear();
+            let trunc_cursor = truncated_reader.block_cursor();
+            let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
+
+            tokio::try_join!(left, right).unwrap();
+
+            assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
+        }
+    }
 }
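
`assert_keys_and_values_eq` walks the two B-tree indices in lockstep after filtering the source stream by LSN, so both streams must yield the same (key, lsn) pairs and run out together. A simplified synchronous sketch of that comparison over sorted tuples, not the async streams used above:

```rust
// Simplified version of the lockstep comparison in `assert_keys_and_values_eq`:
// filter the source by LSN, then require the two sorted sequences to match
// exactly, entry for entry.
fn assert_prefix_eq(source: &[(u64, u64)], truncated: &[(u64, u64)], truncate_at: u64) {
    let mut src = source.iter().filter(|(_, lsn)| *lsn < truncate_at);
    let mut trunc = truncated.iter();
    loop {
        match (src.next(), trunc.next()) {
            (None, None) => break, // both exhausted together
            (Some(a), Some(b)) => assert_eq!(a, b), // same (key, lsn) in lockstep
            (a, b) => panic!("length mismatch: {a:?} vs {b:?}"),
        }
    }
}

fn main() {
    let source = [(1, 0x20), (1, 0x30), (2, 0x40)];
    let truncated = [(1, 0x20), (1, 0x30)];
    assert_prefix_eq(&source, &truncated, 0x40);
}
```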
Some files were not shown because too many files have changed in this diff.