proxy: remove self-signed flag logic

Make python Safekeeper datadir Path instead of str.
safekeeper: test pull_timeline with WAL gc.
2026-06-29 18:20:38 +00:00 · 2024-05-25 16:24:41 +01:00 · 2024-05-25 06:06:32 +03:00 · 2024-05-25 06:06:32 +03:00 · 2024-05-24 17:56:12 +01:00 · 2024-05-24 14:11:51 +01:00
370 changed files with 26999 additions and 9958 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +1,2 @@
 [profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
+slow-timeout = { period = "60s", terminate-after = 3 }
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
 !patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,12 +1,11 @@
 self-hosted-runner:
  labels:
    - arm64
    - dev
    - gen3
    - large
-    # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
+    - large-arm64
    - macos-14
    - small
    - small-arm64
    - us-east-2
 config-variables:
  - REMOTE_STORAGE_AZURE_CONTAINER
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
 outputs:
  dsn:
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
    required: true
  branch_id:
-    desctiption: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
 runs:
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  region_id:
-    desctiption: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    desctiption: 'Postgres version; default is 15'
+    description: 'Postgres version; default is 15'
-    default: 15
+    default: '15'
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
  provisioner:
-    desctiption: 'k8s-pod or k8s-neonvm'
+    description: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
  compute_units:
-    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'
 outputs:
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
 runs:
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -39,7 +39,7 @@ jobs:
      matrix:
        arch: [ x64, arm64 ]
-    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
    env:
      IMAGE_TAG: ${{ inputs.image-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -236,27 +236,6 @@ jobs:
          submodules: true
          fetch-depth: 1
      - name: Check Postgres submodules revision
        shell: bash -euo pipefail {0}
        run: |
          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. updated intentionally).
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
          FAILED=false
          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
              FAILED=true
            fi
          done
          if [ "${FAILED}" = "true" ]; then
            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
            exit 1
          fi
      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -362,6 +341,9 @@ jobs:
        env:
          NEXTEST_RETRIES: 3
        run: |
          #nextest does not yet support running doctests
          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
          for io_engine in std-fs tokio-epoll-uring ; do
            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
          done
@@ -477,6 +459,8 @@ jobs:
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
          PAGESERVER_GET_IMPL: vectored
          PAGESERVER_VALIDATE_VEC_GET: true
      # Temporarily disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -556,12 +540,33 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
          PAGESERVER_GET_IMPL: vectored
          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
    runs-on: ubuntu-latest
    steps:
    - uses: slackapi/slack-github-action@v1
      with:
        channel-id: C060CNA47S9 # on-call-staging-storage-stream
        slack-message: |
          Benchmarks failed on main: ${{ github.event.head_commit.url }}
          Allure report: ${{ needs.create-test-report.outputs.report-url }}
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
  create-test-report:
    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -718,9 +723,13 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit
-  neon-image:
+  neon-image-arch:
    needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+    strategy:
      matrix:
        arch: [ x64, arm64 ]
    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
    steps:
      - name: Checkout
@@ -742,12 +751,6 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      - uses: docker/login-action@v3
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
      - uses: docker/build-push-action@v5
        with:
          context: .
@@ -759,25 +762,52 @@ jobs:
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
            neondatabase/neon:${{needs.tag.outputs.build-tag}}
      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom
-  compute-node-image:
+  neon-image:
-    needs: [ check-permissions, build-build-tools-image, tag ]
+    needs: [ neon-image-arch, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: ubuntu-latest
    steps:
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      - name: Create multi-arch image
        run: |
          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
      - uses: docker/login-action@v3
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
      - name: Push multi-arch image to ECR
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
  compute-node-image-arch:
    needs: [ check-permissions, build-build-tools-image, tag ]
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
        arch: [ x64, arm64 ]
    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
    steps:
      - name: Checkout
@@ -824,15 +854,14 @@ jobs:
          push: true
          pull: true
          file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
-        if: ${{ matrix.version == 'v16' }}
+        if: matrix.version == 'v16'
        uses: docker/build-push-action@v5
        with:
          target: compute-tools-image
@@ -846,14 +875,57 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom
  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
    runs-on: ubuntu-latest
    strategy:
      matrix:
        version: [ v14, v15, v16 ]
    steps:
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      - name: Create multi-arch compute-node image
        run: |
          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
      - name: Create multi-arch compute-tools image
        if: matrix.version == 'v16'
        run: |
          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
      - uses: docker/login-action@v3
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
      - name: Push multi-arch compute-tools image to ECR
        if: matrix.version == 'v16'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
@@ -861,11 +933,8 @@ jobs:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.29.3
    steps:
      - name: Checkout
@@ -878,26 +947,48 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder
      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
      # The default value is ~/.docker
      - name: Set custom docker config directory
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
      - name: Build vm image
        run: |
          ./vm-builder \
            -spec=vm-image-spec.yaml \
-            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
      - name: Pushing vm-compute-node image
        run: |
-          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom
  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    strategy:
      fail-fast: false
      matrix:
        arch: [ x64, arm64 ]
    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
    steps:
      - name: Checkout
@@ -915,7 +1006,7 @@ jobs:
      - name: Verify image versions
        shell: bash # ensure no set -e for better error messages
        run: |
-          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
          echo "Pageserver version string: $pageserver_version"
@@ -941,78 +1032,48 @@ jobs:
  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: ubuntu-latest
-    container: golang:1.19-bullseye
+
-    # Don't add if-condition here.
+    env:
-    # The job should always be run because we have dependant other jobs that shouldn't be skipped
+      VERSIONS: v14 v15 v16
    steps:
-      - name: Install Crane & ECR helper
+      - uses: docker/login-action@v3
-        run: |
+        with:
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-      - name: Configure ECR login
+      - uses: docker/login-action@v3
-        run: |
+        with:
-          mkdir /github/home/.docker/
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-      - name: Copy vm-compute-node images to Docker Hub
+      - name: Copy vm-compute-node images to ECR
        run: |
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          for version in ${VERSIONS}; do
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
+                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done
      - name: Add latest tag to images
-        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        if: github.ref_name == 'main'
        run: |
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
+          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
+            docker buildx imagetools create -t $repo/neon:latest \
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-      - name: Push images to production ECR
+            docker buildx imagetools create -t $repo/compute-tools:latest \
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
-      - name: Configure Docker Hub login
+            for version in ${VERSIONS}; do
-        run: |
+              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
-          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
+                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          echo "" > /github/home/.docker/config.json
          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
-      - name: Push vm-compute-node to Docker Hub
+              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
-        run: |
+                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
-          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+            done
-          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          done
          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
      - name: Push latest tags to Docker Hub
        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -136,7 +136,7 @@ jobs:
  check-linux-arm-build:
    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
    env:
      # Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
      - name: Run cargo build
        run: |
-          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
      - name: Run cargo test
        env:
          NEXTEST_RETRIES: 3
        run: |
-          cargo nextest run $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES -j$(nproc)
          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
  check-codestyle-rust-arm:
    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,6 +269,11 @@ jobs:
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
    steps:
      - name: Fix git ownership
        run: |
@@ -305,31 +310,35 @@ jobs:
            exit 1
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
        if: matrix.build_type == 'debug'
        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
      - name: Run cargo clippy (release)
        if: matrix.build_type == 'release'
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
      - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
+        if: matrix.build_type == 'release'
        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
        env:
            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
      - name: Check formatting
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
        run: cargo fmt --all -- --check
      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
      - name: Check rust dependencies
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
        run: |
          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
        run: cargo deny check
  gather-rust-build-stats:
@@ -338,7 +347,7 @@ jobs:
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, large ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
      credentials:
@@ -369,7 +378,7 @@ jobs:
        run: make walproposer-lib -j$(nproc)
      - name: Produce the build stats
-        run: cargo build --all --release --timings
+        run: cargo build --all --release --timings -j$(nproc)
      - name: Upload the build stats
        id: upload-stats
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Release ${RELEASE_DATE}
+          ## Storage & Compute release ${RELEASE_DATE}
          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,25 +41,26 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = "0.18"
+azure_core = "0.19"
-azure_identity = "0.18"
+azure_identity = "0.19"
-azure_storage = "0.18"
+azure_storage = "0.19"
-azure_storage_blobs = "0.18"
+azure_storage_blobs = "0.19"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
+aws-config = { version = "1.3", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.14"
+aws-sdk-s3 = "1.26"
 aws-sdk-iam = "1.15.0"
-aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
+aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.1.4"
+aws-smithy-types = "1.1.9"
-aws-credential-types = "1.1.4"
+aws-credential-types = "1.2.0"
-aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
+aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
-aws-types = "1.1.7"
+aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
@@ -81,13 +83,14 @@ enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
-hashbrown = "0.13"
+hashbrown = "0.14"
-hashlink = "0.8.4"
+hashlink = "0.9.1"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
@@ -98,7 +101,8 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.13.0"
+tokio-tungstenite = "0.20.0"
 indexmap = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
@@ -120,8 +124,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
@@ -130,10 +134,10 @@ prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
-reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
-reqwest-middleware = "0.2.0"
+reqwest-middleware = "0.3.0"
-reqwest-retry = "0.2.2"
+reqwest-retry = "0.5"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
@@ -143,7 +147,7 @@ rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
-sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
@@ -157,7 +161,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.1"
+# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
 svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
@@ -176,10 +181,11 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.20.0"
+tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
@@ -240,8 +246,8 @@ tonic-build = "0.9"
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 # bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
 ################# Binary contents sections
@@ -252,7 +258,7 @@ debug = true
 # disable debug symbols for all packages except this one to decrease binaries size
 [profile.release.package."*"]
-debug = true
+debug = false
 [profile.release-line-debug]
 inherits = "release"
--- a/1
+++ b/1
@@ -44,7 +44,6 @@ COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_i
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .
 ENV _RJEM_MALLOC_CONF="prof:true"
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
    && mv s5cmd /usr/local/bin/s5cmd
 # LLVM
-ENV LLVM_VERSION=17
+ENV LLVM_VERSION=18
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip
 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
+ENV MOLD_VERSION=v2.31.0
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.77.0
+ENV RUSTC_VERSION=1.78.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,11 +241,17 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+COPY patches/pgvector.patch /pgvector.patch
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+
 # By default, pgvector Makefile uses `-march=native`. We don't want that, 
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
 #########################################################################################
--- a/29
+++ b/29
@@ -25,14 +25,16 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	# macOS with brew-installed openssl requires explicit paths
+	ifndef DISABLE_HOMEBREW
-	# It can be configured with OPENSSL_PREFIX variable
+		# macOS with brew-installed openssl requires explicit paths
-	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+		# It can be configured with OPENSSL_PREFIX variable
-	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
+		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
 		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 	endif
 endif
 # Use -C option so that when PostgreSQL "make install" installs the
@@ -79,11 +81,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
 		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
-	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
+
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
+	VERSION=$*; \
 	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
 	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
 		CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
+		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
 # nicer alias to run 'configure'
 # Note: I've been unable to use templates for this part of our configuration.
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
+[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
 # Neon
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 tokio-stream.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
 compute_api.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -47,10 +47,11 @@ use chrono::Utc;
 use clap::Arg;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use url::Url;
 use compute_api::responses::ComputeStatus;
 use compute_api::spec::ComputeSpec;
 use compute_tools::compute::{
    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -62,12 +63,41 @@ use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";
 /// Entry point: runs the startup steps in order (CLI parsing, waiting for the
 /// spec, starting Postgres), then waits for Postgres to exit, cleans up and
 /// exits via `deinit_and_exit`, which never returns.
 fn main() -> Result<()> {
    let (build_tag, clap_args) = init()?;
    // The block scopes `_startup_context_guard`: it is dropped at the closing
    // brace, i.e. once `start_postgres` has returned.
    let (pg_handle, start_pg_result) = {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();
        let cli_args = process_cli(&clap_args)?;
        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
        start_postgres(&clap_args, wait_spec_result)?
        // Startup is finished, exit the startup tracing span
    };
    // PostgreSQL is now running, if startup was successful. Wait until it exits.
    let wait_pg_result = wait_postgres(pg_handle)?;
    let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
    maybe_delay_exit(delay_exit);
    // Diverges (`-> !`), so nothing after this call is reachable.
    deinit_and_exit(wait_pg_result);
 }
 fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -82,9 +112,15 @@ fn main() -> Result<()> {
        .to_string();
    info!("build_tag: {build_tag}");
-    let matches = cli().get_matches();
+    Ok((build_tag, cli().get_matches()))
-    let pgbin_default = String::from("postgres");
+}
-    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
 fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
    let pgbin_default = "postgres";
    let pgbin = matches
        .get_one::<String>("pgbin")
        .map(|s| s.as_str())
        .unwrap_or(pgbin_default);
    let ext_remote_storage = matches
        .get_one::<String>("remote-ext-config")
@@ -110,7 +146,32 @@ fn main() -> Result<()> {
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
    Ok(ProcessCliResult {
        connstr,
        pgdata,
        pgbin,
        ext_remote_storage,
        http_port,
        spec_json,
        spec_path,
        resize_swap_on_bind,
    })
 }
 /// Values extracted from the command line by `process_cli`. String fields
 /// borrow for the `'clap` lifetime instead of being copied.
 struct ProcessCliResult<'clap> {
    // Postgres connection string; `process_cli` panics if it is missing.
    connstr: &'clap str,
    pgdata: &'clap str,
    // Value of the `pgbin` argument, or "postgres" when it is not given.
    pgbin: &'clap str,
    // Value of the `remote-ext-config` argument, if any.
    ext_remote_storage: Option<&'clap str>,
    http_port: u16,
    // Value of the `spec` argument, if any.
    spec_json: Option<&'clap String>,
    // Value of the `spec-path` argument, if any.
    spec_path: Option<&'clap String>,
    // Whether the `resize-swap-on-bind` flag was passed.
    resize_swap_on_bind: bool,
 }
 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -147,7 +208,7 @@ fn main() -> Result<()> {
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
-    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
+    if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
@@ -157,8 +218,17 @@ fn main() -> Result<()> {
        Some(guard)
    } else {
        None
-    };
+    }
 }
 fn try_spec_from_cli(
    matches: &clap::ArgMatches,
    ProcessCliResult {
        spec_json,
        spec_path,
        ..
    }: &ProcessCliResult,
 ) -> Result<CliSpecParams> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -199,6 +269,34 @@ fn main() -> Result<()> {
        }
    };
    Ok(CliSpecParams {
        spec,
        live_config_allowed,
    })
 }
 struct CliSpecParams {
    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
    spec: Option<ComputeSpec>,
    live_config_allowed: bool,
 }
 fn wait_spec(
    build_tag: String,
    ProcessCliResult {
        connstr,
        pgdata,
        pgbin,
        ext_remote_storage,
        resize_swap_on_bind,
        http_port,
        ..
    }: ProcessCliResult,
    CliSpecParams {
        spec,
        live_config_allowed,
    }: CliSpecParams,
 ) -> Result<WaitSpecResult> {
    let mut new_state = ComputeState::new();
    let spec_set;
@@ -226,19 +324,17 @@ fn main() -> Result<()> {
    // If this is a pooled VM, prewarm before starting HTTP server and becoming
    // available for binding. Prewarming helps Postgres start quicker later,
-    // because QEMU will already have it's memory allocated from the host, and
+    // because QEMU will already have its memory allocated from the host, and
    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }
-    // Launch http service first, so we were able to serve control-plane
+    // Launch http service first, so that we can serve control-plane requests
-    // requests, while configuration is still in progress.
+    // while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
    let extension_server_port: u16 = http_port;
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -253,21 +349,45 @@ fn main() -> Result<()> {
                break;
            }
        }
        // Record for how long we slept waiting for the spec.
        let now = Utc::now();
        state.metrics.wait_for_spec_ms = now
            .signed_duration_since(state.start_time)
            .to_std()
            .unwrap()
            .as_millis() as u64;
        // Reset start time, so that the total startup time that is calculated later will
        // not include the time that we waited for the spec.
        state.start_time = now;
    }
    Ok(WaitSpecResult {
        compute,
        http_port,
        resize_swap_on_bind,
    })
 }
 struct WaitSpecResult {
    compute: Arc<ComputeNode>,
    // passed through from ProcessCliResult
    http_port: u16,
    resize_swap_on_bind: bool,
 }
 fn start_postgres(
    // need to allow unused because `matches` is only used if target_os = "linux"
    #[allow(unused_variables)] matches: &clap::ArgMatches,
    WaitSpecResult {
        compute,
        http_port,
        resize_swap_on_bind,
    }: WaitSpecResult,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
    // Record for how long we slept waiting for the spec.
    state.metrics.wait_for_spec_ms = Utc::now()
        .signed_duration_since(state.start_time)
        .to_std()
        .unwrap()
        .as_millis() as u64;
    // Reset start time to the actual start of the configuration, so that
    // total startup time was properly measured at the end.
    state.start_time = Utc::now();
    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
@@ -275,33 +395,72 @@ fn main() -> Result<()> {
        "running compute with features: {:?}",
        state.pspec.as_ref().unwrap().spec.features
    );
    // before we release the mutex, fetch the swap size (if any) for later.
    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
    drop(state);
    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);
-    // Start Postgres
+    let mut prestartup_failed = false;
    let mut delay_exit = false;
-    let mut exit_code = None;
+
-    let pg = match compute.start_compute(extension_server_port) {
+    // Resize swap to the desired size if the compute spec says so
-        Ok(pg) => Some(pg),
+    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
-        Err(err) => {
+        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
-            error!("could not start the compute node: {:#}", err);
+        // *before* starting postgres.
-            let mut state = compute.state.lock().unwrap();
+        //
-            state.error = Some(format!("{:?}", err));
+        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
-            state.status = ComputeStatus::Failed;
+        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
-            // Notify others that Postgres failed to start. In case of configuring the
+        // OOM-killed during startup because swap wasn't available yet.
-            // empty compute, it's likely that API handler is still waiting for compute
+        match resize_swap(size_bytes) {
-            // state change. With this we will notify it that compute is in Failed state,
+            Ok(()) => {
-            // so control plane will know about it earlier and record proper error instead
+                let size_gib = size_bytes as f32 / (1 << 30) as f32; // bytes -> GiB (2^30), just for more coherent display.
-            // of timeout.
+                info!(%size_bytes, %size_gib, "resized swap");
-            compute.state_changed.notify_all();
+            }
-            drop(state); // unlock
+            Err(err) => {
-            delay_exit = true;
+                let err = err.context("failed to resize swap");
-            None
+                error!("{err:#}");
                // Mark compute startup as failed; don't try to start postgres, and report this
                // error to the control plane when it next asks.
                prestartup_failed = true;
                let mut state = compute.state.lock().unwrap();
                state.error = Some(format!("{err:?}"));
                state.status = ComputeStatus::Failed;
                compute.state_changed.notify_all();
                delay_exit = true;
            }
        }
-    };
+    }
    let extension_server_port: u16 = http_port;
    // Start Postgres
    let mut pg = None;
    if !prestartup_failed {
        pg = match compute.start_compute(extension_server_port) {
            Ok(pg) => Some(pg),
            Err(err) => {
                error!("could not start the compute node: {:#}", err);
                let mut state = compute.state.lock().unwrap();
                state.error = Some(format!("{:?}", err));
                state.status = ComputeStatus::Failed;
                // Notify others that Postgres failed to start. In case of configuring the
                // empty compute, it's likely that API handler is still waiting for compute
                // state change. With this we will notify it that compute is in Failed state,
                // so control plane will know about it earlier and record proper error instead
                // of timeout.
                compute.state_changed.notify_all();
                drop(state); // unlock
                delay_exit = true;
                None
            }
        };
    } else {
        warn!("skipping postgres startup because pre-startup step failed");
    }
    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
    // because it requires cgroups.
@@ -334,7 +493,7 @@ fn main() -> Result<()> {
            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();
-            let vm_monitor = &rt.as_ref().map(|rt| {
+            let vm_monitor = rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
@@ -347,12 +506,41 @@ fn main() -> Result<()> {
        }
    }
    Ok((
        pg,
        StartPostgresResult {
            delay_exit,
            compute,
            #[cfg(target_os = "linux")]
            rt,
            #[cfg(target_os = "linux")]
            token,
            #[cfg(target_os = "linux")]
            vm_monitor,
        },
    ))
 }
 type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
 struct StartPostgresResult {
    delay_exit: bool,
    // passed through from WaitSpecResult
    compute: Arc<ComputeNode>,
    #[cfg(target_os = "linux")]
    rt: Option<tokio::runtime::Runtime>,
    #[cfg(target_os = "linux")]
    token: tokio_util::sync::CancellationToken,
    #[cfg(target_os = "linux")]
    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
 }
 fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
@@ -367,6 +555,25 @@ fn main() -> Result<()> {
        exit_code = ecode.code()
    }
    Ok(WaitPostgresResult { exit_code })
 }
 struct WaitPostgresResult {
    exit_code: Option<i32>,
 }
 fn cleanup_after_postgres_exit(
    StartPostgresResult {
        mut delay_exit,
        compute,
        #[cfg(target_os = "linux")]
        vm_monitor,
        #[cfg(target_os = "linux")]
        token,
        #[cfg(target_os = "linux")]
        rt,
    }: StartPostgresResult,
 ) -> Result<bool> {
    // Terminate the vm_monitor so it releases the file watcher on
    // /sys/fs/cgroup/neon-postgres.
    // Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -408,13 +615,19 @@ fn main() -> Result<()> {
        error!("error while checking for core dumps: {err:?}");
    }
    Ok(delay_exit)
 }
 /// Sleeps for 30 seconds before returning when `delay_exit` is set; returns
 /// immediately otherwise.
 fn maybe_delay_exit(delay_exit: bool) {
    if !delay_exit {
        return;
    }
    // The launch failed: keep serving HTTP requests for a while, so the cloud
    // control plane can get the actual error.
    let grace_period = Duration::from_secs(30);
    info!("giving control plane 30s to collect the error before shutdown");
    thread::sleep(grace_period);
 }
 fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
    // Shutdown trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down OTEL tracing provider may
    // hang for quite some time, see, for example:
@@ -526,6 +739,11 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
            Arg::new("resize-swap-on-bind")
                .long("resize-swap-on-bind")
                .action(clap::ArgAction::SetTrue),
        )
 }
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -0,0 +1,116 @@
 use compute_api::{
    responses::CatalogObjects,
    spec::{Database, Role},
 };
 use futures::Stream;
 use postgres::{Client, NoTls};
 use std::{path::Path, process::Stdio, result::Result, sync::Arc};
 use tokio::{
    io::{AsyncBufReadExt, BufReader},
    process::Command,
    task,
 };
 use tokio_stream::{self as stream, StreamExt};
 use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
 use crate::{
    compute::ComputeNode,
    pg_helpers::{get_existing_dbs, get_existing_roles},
 };
 /// Connects to Postgres using the compute's connection string and returns the
 /// existing roles and databases. The synchronous `postgres` client is run on
 /// tokio's blocking thread pool so it does not block the async caller.
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
    let connstr = compute.connstr.clone();
    let catalog_task = task::spawn_blocking(move || {
        let mut client = Client::connect(connstr.as_str(), NoTls)?;
        // Roles are read inside a transaction; it is dropped (never committed)
        // before the client is used again for the databases.
        let mut xact = client.transaction()?;
        let roles: Vec<Role> = get_existing_roles(&mut xact)?;
        drop(xact);
        let existing_dbs = get_existing_dbs(&mut client)?;
        let databases: Vec<Database> = existing_dbs.values().cloned().collect();
        Ok(CatalogObjects { roles, databases })
    });
    // The outer `?` propagates a join error; the inner result is returned as is.
    catalog_task.await?
 }
 /// Errors returned by `get_database_schema`.
 #[derive(Debug, thiserror::Error)]
 pub enum SchemaDumpError {
    /// pg_dump produced no output and its first stderr line reported that the
    /// requested database does not exist.
    #[error("Database does not exist.")]
    DatabaseDoesNotExist,
    /// Any I/O failure while spawning pg_dump or reading from it; `#[from]`
    /// lets `?` convert a `std::io::Error` into this variant.
    #[error("Failed to execute pg_dump.")]
    IO(#[from] std::io::Error),
 }
 // Dumps the schema of the database `dbname` using the pg_dump utility.
 // The output is streamed back to the caller and is meant to be streamed on via HTTP.
 //
 // Before returning the stream, it checks that pg_dump produced some output.
 // If it did not, the first line of stderr is inspected to determine whether the
 // database does not exist, in which case a special error is returned.
 //
 // pg_dump is spawned with tokio's kill_on_drop, so it is killed when its `Child`
 // handle is dropped. The handle is therefore moved into the returned stream:
 // pg_dump keeps running while the stream is being consumed, and is killed as soon
 // as the caller drops the stream.
 pub async fn get_database_schema(
    compute: &Arc<ComputeNode>,
    dbname: &str,
 ) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
    // pg_dump is looked up in the same directory as the postgres binary.
    let pgbin = &compute.pgbin;
    let basepath = Path::new(pgbin).parent().unwrap();
    let pgdump = basepath.join("pg_dump");
    // Point the compute's connection string at the requested database.
    let mut connstr = compute.connstr.clone();
    connstr.set_path(dbname);
    let mut cmd = Command::new(pgdump)
        .arg("--schema-only")
        .arg(connstr.as_str())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .kill_on_drop(true)
        .spawn()?;
    let stdout = cmd.stdout.take().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
    })?;
    let stderr = cmd.stderr.take().ok_or_else(|| {
        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
    })?;
    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
    let stderr_reader = BufReader::new(stderr);
    // Wait for the first chunk of output so that a failed dump can be reported as
    // an error instead of as an empty stream.
    let first_chunk = match stdout_reader.next().await {
        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
        Some(Err(e)) => {
            return Err(SchemaDumpError::IO(e));
        }
        _ => {
            // No output at all: look at stderr to find out why.
            let mut lines = stderr_reader.lines();
            if let Some(line) = lines.next_line().await? {
                if line.contains(&format!("FATAL:  database \"{}\" does not exist", dbname)) {
                    return Err(SchemaDumpError::DatabaseDoesNotExist);
                }
                warn!("pg_dump stderr: {}", line)
            }
            // Log whatever else pg_dump wrote to stderr in the background.
            tokio::spawn(async move {
                while let Ok(Some(line)) = lines.next_line().await {
                    warn!("pg_dump stderr: {}", line)
                }
            });
            return Err(SchemaDumpError::IO(std::io::Error::new(
                std::io::ErrorKind::Other,
                "failed to start pg_dump",
            )));
        }
    };
    let initial_stream = stream::once(Ok(first_chunk.freeze()));
    // Consume stderr and log warnings
    tokio::spawn(async move {
        let mut lines = stderr_reader.lines();
        while let Ok(Some(line)) = lines.next_line().await {
            warn!("pg_dump stderr: {}", line)
        }
    });
    Ok(initial_stream.chain(stdout_reader.map(move |res| {
        // Referencing `cmd` makes this `move` closure own the child handle, so the
        // handle (and with it the pg_dump process) lives exactly as long as the
        // returned stream. Without this, `cmd` would be dropped when this function
        // returns and kill_on_drop would kill pg_dump while the caller is still
        // reading its output.
        let _keep_alive = &cmd;
        res.map(|b| b.freeze())
    })))
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 use crate::catalog::SchemaDumpError;
 use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
 use anyhow::Result;
 use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
 use utils::http::request::must_get_query_param;
 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
    ComputeStatusResponse {
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }
        (&Method::GET, "/dbs_and_roles") => {
            info!("serving /dbs_and_roles GET request",);
            match get_dbs_and_roles(compute).await {
                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
                Err(_) => {
                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
                }
            }
        }
        (&Method::GET, "/database_schema") => {
            let database = match must_get_query_param(&req, "database") {
                Err(e) => return e.into_response(),
                Ok(database) => database,
            };
            info!("serving /database_schema GET request with database: {database}",);
            match get_database_schema(compute, &database).await {
                Ok(res) => render_plain(Body::wrap_stream(res)),
                Err(SchemaDumpError::DatabaseDoesNotExist) => {
                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
                }
                Err(e) => {
                    error!("can't get schema dump: {}", e);
                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
                }
            }
        }
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
    };
    Response::builder()
        .status(status)
        .header(CONTENT_TYPE, "application/json")
        .body(Body::from(serde_json::to_string(&error).unwrap()))
        .unwrap()
 }
 /// Wraps `body` in a response whose `Content-Type` header is `application/json`.
 ///
 /// No status code is set here, so whatever the response builder defaults to applies.
 fn render_json(body: Body) -> Response<Body> {
    let builder = Response::builder().header(CONTENT_TYPE, "application/json");
    // Panics only if the builder rejects the parts assembled above.
    builder.body(body).unwrap()
 }
 /// Wraps `body` in a response whose `Content-Type` header is `text/plain`.
 ///
 /// No status code is set here, so whatever the response builder defaults to applies.
 fn render_plain(body: Body) -> Response<Body> {
    let builder = Response::builder().header(CONTENT_TYPE, "text/plain");
    // Panics only if the builder rejects the parts assembled above.
    builder.body(body).unwrap()
 }
 async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
    {
        let mut state = compute.state.lock().unwrap();
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,6 +68,51 @@ paths:
              schema:
                $ref: "#/components/schemas/Info"
  # Lists the databases and roles known to the compute's catalog.
  /dbs_and_roles:
    get:
      tags:
        - Info
      summary: Get databases and roles in the catalog.
      description: ""
      operationId: getDbsAndRoles
      responses:
        # Status codes are quoted strings, as OpenAPI requires for JSON/YAML
        # compatibility.
        "200":
          description: Compute schema objects
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/DbsAndRoles"
        # The handler answers with a JSON error body when the catalog lookup
        # fails.
        "500":
          description: Can't get databases and roles.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
  # Streams a schema-only SQL dump of the database named in the query string.
  /database_schema:
    get:
      tags:
        - Info
      summary: Get schema dump
      parameters:
        - name: database
          in: query
          description: Database name to dump.
          required: true
          schema:
            type: string
          example: "postgres"
      description: Get schema dump in SQL format.
      operationId: getDatabaseSchema
      responses:
        # Status codes are quoted strings, as OpenAPI requires for JSON/YAML
        # compatibility.
        "200":
          description: Schema dump
          content:
            text/plain:
              schema:
                type: string
                description: Schema dump in SQL format.
        "404":
          description: Non existing database.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
        # Any other dump failure is reported by the handler as a JSON error
        # body.
        "500":
          description: Can't get schema dump.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
  /check_writability:
    post:
      tags:
@@ -229,6 +274,73 @@ components:
        num_cpus:
          type: integer
    # Response body of GET /dbs_and_roles: both lists are always present.
    DbsAndRoles:
      type: object
      description: Databases and Roles
      required:
        - roles
        - databases
      properties:
        roles:
          type: array
          items:
            $ref: "#/components/schemas/Role"
        databases:
          type: array
          items:
            $ref: "#/components/schemas/Database"
    # One element of DbsAndRoles.databases. `options` is the only property
    # that may be omitted.
    Database:
      type: object
      description: Database
      required:
        - name
        - owner
        - restrict_conn
        - invalid
      properties:
        name:
          type: string
        owner:
          type: string
        options:
          type: array
          items:
            $ref: "#/components/schemas/GenericOption"
        restrict_conn:
          type: boolean
        invalid:
          type: boolean
    # One element of DbsAndRoles.roles. Only `name` is required;
    # `encrypted_password` and `options` may be omitted.
    Role:
      type: object
      description: Role
      required:
        - name
      properties:
        name:
          type: string
        encrypted_password:
          type: string
        options:
          type: array
          items:
            $ref: "#/components/schemas/GenericOption"
    # Option entry referenced by Role.options and Database.options.
    # `value` may be omitted; `name` and `vartype` are always present.
    GenericOption:
      type: object
      description: Schema Generic option
      required:
        - name
        - vartype
      properties:
        name:
          type: string
        value:
          type: string
        vartype:
          type: string
    ComputeState:
      type: object
      required:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,10 +8,12 @@ pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
 pub mod swap;
 pub mod sync_sk;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();
-                    if existing_dbs.get(&op.name).is_some() {
+                    if existing_dbs.contains_key(&op.name) {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -0,0 +1,45 @@
 use std::path::Path;
 use anyhow::{anyhow, Context};
 use tracing::warn;
 pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
 /// Resizes swap by running `/usr/bin/sudo /neonvm/bin/resize-swap --once {size_bytes}`
 /// and waiting for it to exit.
 ///
 /// A non-zero exit is tolerated (with a warning) when the resize-swap binary no longer
 /// exists on disk. Every other failure is returned, wrapped with the command line that
 /// could not be run.
 pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
    // Passing '--once' causes resize-swap to delete itself after successful completion, which
    // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
    // postgres is running.
    //
    // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
    let run_once = || -> anyhow::Result<()> {
        let mut child = std::process::Command::new("/usr/bin/sudo")
            .arg(RESIZE_SWAP_BIN)
            .arg("--once")
            .arg(size_bytes.to_string())
            .spawn()
            .context("spawn() failed")?;
        let status = child.wait().context("wait() failed")?;
        if status.success() {
            return Ok(());
        }

        // The command failed. Maybe the binary already removed itself on an earlier
        // successful `--once` run; in that case there is nothing left to do.
        match Path::new(RESIZE_SWAP_BIN).try_exists() {
            // The path doesn't exist; we're actually ok
            Ok(false) => {
                warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
                Ok(())
            }
            // The binary is still there (or we could not tell): report the failure.
            Ok(true) | Err(_) => Err(anyhow!("process exited with {status}")),
        }
    };

    // wrap any prior error with the overall context that we couldn't run the command
    run_once().with_context(|| {
        format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
    })
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,6 +17,7 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
@@ -27,6 +28,7 @@ serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
 tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,20 +9,23 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
+use control_plane::local_env::{
-use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
+    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
    SafekeeperConf,
 };
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use pageserver_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::{
@@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15";
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 /// Renders the built-in default neon_local configuration as TOML text.
 ///
 /// The result contains the control plane API URL, one `[broker]` section, one
 /// `[[safekeepers]]` entry, and `num_pageservers` `[[pageservers]]` entries.
 fn default_conf(num_pageservers: u16) -> String {
    // Fixed part of the config: control plane API, broker and the single safekeeper.
    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
 control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
 "#,
    );
    // The i-th pageserver gets the default id and default ports, each offset by `i`.
    for i in 0..num_pageservers {
        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
        // Every pageserver listens on 127.0.0.1 and uses trust auth for both pg and http.
        template += &format!(
            r#"
 [[pageservers]]
 id = {pageserver_id}
 listen_pg_addr = '127.0.0.1:{pg_port}'
 listen_http_addr = '127.0.0.1:{http_port}'
 pg_auth_type = '{trust_auth}'
 http_auth_type = '{trust_auth}'
 "#,
            trust_auth = AuthType::Trust,
        )
    }
    template
 }
 ///
 /// Timelines tree element used as a value in the HashMap.
 ///
@@ -133,7 +98,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(sub_args, &env)),
+            "start" => rt.block_on(handle_start_all(&env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -152,7 +117,7 @@ fn main() -> Result<()> {
    };
    match subcommand_result {
-        Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
+        Ok(Some(updated_env)) => updated_env.persist_config()?,
        Ok(None) => (),
        Err(e) => {
            eprintln!("command failed: {e:?}");
@@ -341,48 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let num_pageservers = init_match
+    let num_pageservers = init_match.get_one::<u16>("num-pageservers");
-        .get_one::<u16>("num-pageservers")
+
-        .expect("num-pageservers arg has a default");
+    let force = init_match.get_one("force").expect("we set a default value");
-    // Create config file
+
-    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
+    // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
    let init_conf: NeonLocalInitConf = if let Some(config_path) =
        init_match.get_one::<PathBuf>("config")
    {
        // User (likely the Python test suite) provided a description of the environment.
        if num_pageservers.is_some() {
            bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
        }
        // load and parse the file
-        std::fs::read_to_string(config_path).with_context(|| {
+        let contents = std::fs::read_to_string(config_path).with_context(|| {
            format!(
                "Could not read configuration file '{}'",
                config_path.display()
            )
-        })?
+        })?;
        toml_edit::de::from_str(&contents)?
    } else {
-        // Built-in default config
+        // User (likely interactive) did not provide a description of the environment, give them the default
-        default_conf(*num_pageservers)
+        NeonLocalInitConf {
            control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
            broker: NeonBroker {
                listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
            },
            safekeepers: vec![SafekeeperConf {
                id: DEFAULT_SAFEKEEPER_ID,
                pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
                http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
                ..Default::default()
            }],
            pageservers: (0..num_pageservers.copied().unwrap_or(1))
                .map(|i| {
                    let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
                    let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
                    let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
                    NeonLocalInitPageserverConf {
                        id: pageserver_id,
                        listen_pg_addr: format!("127.0.0.1:{pg_port}"),
                        listen_http_addr: format!("127.0.0.1:{http_port}"),
                        pg_auth_type: AuthType::Trust,
                        http_auth_type: AuthType::Trust,
                        other: Default::default(),
                    }
                })
                .collect(),
            pg_distrib_dir: None,
            neon_distrib_dir: None,
            default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
            storage_controller: None,
            control_plane_compute_hook_api: None,
        }
    };
-    let pg_version = init_match
+    LocalEnv::init(init_conf, force)
-        .get_one::<u32>("pg-version")
+        .context("materialize initial neon_local environment on disk")?;
-        .copied()
+    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
        .context("Failed to parse postgres version from the argument string")?;
    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
    let force = init_match.get_one("force").expect("we set a default value");
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;
    // Create remote storage location for default LocalFs remote storage
    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
            .initialize(&pageserver_config_overrides(init_match))
            .unwrap_or_else(|e| {
                eprintln!("pageserver init failed: {e:?}");
                exit(1);
            });
    }
    Ok(env)
 }
 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -397,15 +379,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
    PageServerNode::from_env(env, ps_conf)
 }
 /// Collects every `pageserver-config-override` value from `init_match` as string slices.
 ///
 /// Returns an empty vector when the argument was not supplied.
 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    match init_match.get_many::<String>("pageserver-config-override") {
        Some(overrides) => overrides.map(String::as_str).collect(),
        None => Vec::new(),
    }
 }
 async fn handle_tenant(
    tenant_match: &ArgMatches,
    env: &mut local_env::LocalEnv,
@@ -417,6 +390,54 @@ async fn handle_tenant(
                println!("{} {:?}", t.id, t.state);
            }
        }
        Some(("import", import_match)) => {
            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
            let storage_controller = StorageController::from_env(env);
            let create_response = storage_controller.tenant_import(tenant_id).await?;
            let shard_zero = create_response
                .shards
                .first()
                .expect("Import response omitted shards");
            let attached_pageserver_id = shard_zero.node_id;
            let pageserver =
                PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
            println!(
                "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
            );
            let timelines = pageserver
                .http_client
                .list_timelines(shard_zero.shard_id)
                .await?;
            // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
            let main_timeline = timelines
                .iter()
                .find(|t| t.ancestor_timeline_id.is_none())
                .expect("No timelines found")
                .timeline_id;
            let mut branch_i = 0;
            for timeline in timelines.iter() {
                let branch_name = if timeline.timeline_id == main_timeline {
                    "main".to_string()
                } else {
                    branch_i += 1;
                    format!("branch_{branch_i}")
                };
                println!(
                    "Importing timeline {tenant_id}/{} as branch {branch_name}",
                    timeline.timeline_id
                );
                env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
            }
        }
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
@@ -789,6 +810,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .copied()
                .unwrap_or(false);
            let allow_multiple = sub_args.get_flag("allow-multiple");
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -806,7 +829,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                _ => {}
            }
-            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+            if !allow_multiple {
                cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
            }
            cplane.new_endpoint(
                &endpoint_id,
@@ -835,6 +860,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
            let allow_multiple = sub_args.get_flag("allow-multiple");
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -860,11 +887,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .cloned()
                .unwrap_or_default();
-            cplane.check_conflicting_endpoints(
+            if !allow_multiple {
-                endpoint.mode,
+                cplane.check_conflicting_endpoints(
-                endpoint.tenant_id,
+                    endpoint.mode,
-                endpoint.timeline_id,
+                    endpoint.tenant_id,
-            )?;
+                    endpoint.timeline_id,
                )?;
            }
            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
@@ -1020,10 +1049,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?
+            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1049,10 +1075,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }
-            if let Err(e) = pageserver
+            if let Err(e) = pageserver.start().await {
                .start(&pageserver_config_overrides(subcommand_args))
                .await
            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1179,7 +1202,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }
-async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically
    broker::start_broker_process(env).await?;
@@ -1196,10 +1219,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver
+        if let Err(e) = pageserver.start().await {
            .start(&pageserver_config_overrides(sub_match))
            .await
        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1340,13 +1360,6 @@ fn cli() -> Command {
        .required(false)
        .value_name("stop-mode");
    let pageserver_config_args = Arg::new("pageserver-config-override")
        .long("pageserver-config-override")
        .num_args(1)
        .action(ArgAction::Append)
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
@@ -1380,9 +1393,7 @@ fn cli() -> Command {
    let num_pageservers_arg = Arg::new("num-pageservers")
        .value_parser(value_parser!(u16))
        .long("num-pageservers")
-        .help("How many pageservers to create (default 1)")
+        .help("How many pageservers to create (default 1)");
        .required(false)
        .default_value("1");
    let update_catalog = Arg::new("update-catalog")
        .value_parser(value_parser!(bool))
@@ -1396,20 +1407,25 @@ fn cli() -> Command {
        .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
        .required(false);
    let allow_multiple = Arg::new("allow-multiple")
        .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
        .long("allow-multiple")
        .action(ArgAction::SetTrue)
        .required(false);
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
        .subcommand(
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
                        .required(false)
                        .value_parser(value_parser!(PathBuf))
-                        .value_name("config"),
+                        .value_name("config")
                )
                .arg(pg_version_arg.clone())
                .arg(force_arg)
@@ -1480,6 +1496,8 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1489,7 +1507,6 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1497,15 +1514,14 @@ fn cli() -> Command {
                )
                .subcommand(Command::new("restart")
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("start").about("Start storage controller"))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
@@ -1551,6 +1567,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg.clone())
                    .arg(hot_standby_arg.clone())
                    .arg(update_catalog)
                    .arg(allow_multiple.clone())
                )
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1559,6 +1576,7 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
                    .arg(allow_multiple.clone())
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
@@ -1610,7 +1628,6 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
                .arg(pageserver_config_args)
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -554,6 +554,7 @@ impl Endpoint {
            format_version: 1.0,
            operation_uuid: None,
            features: self.features.clone(),
            swap_size_bytes: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,7 +3,7 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 use clap::ValueEnum;
 use postgres_backend::AuthType;
@@ -17,11 +17,14 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };
 use crate::pageserver::PageServerNode;
 use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;
 pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -33,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
    // compute endpoints).
@@ -41,55 +44,99 @@ pub struct LocalEnv {
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
    // '.neon' if not given.
    #[serde(skip)]
    pub base_data_dir: PathBuf,
    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
    #[serde(default)]
    pub pg_distrib_dir: PathBuf,
    // Path to pageserver binary.
    #[serde(default)]
    pub neon_distrib_dir: PathBuf,
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
    pub default_tenant_id: Option<TenantId>,
    // used to issue tokens during e.g pg start
    #[serde(default)]
    pub private_key_path: PathBuf,
    pub broker: NeonBroker,
    // Configuration for the storage controller (1 per neon_local environment)
    pub storage_controller: NeonStorageControllerConf,
    /// This Vec must always contain at least one pageserver
    /// Populated by [`Self::load_config`] from the individual `pageserver.toml`s.
    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
    pub pageservers: Vec<PageServerConf>,
    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,
    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
    #[serde(default)]
    pub control_plane_api: Option<Url>,
    // Control plane upcall API for storage controller.  If set, this will be propagated into the
    // storage controller's configuration.
    #[serde(default)]
    pub control_plane_compute_hook_api: Option<Url>,
    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
    pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }
 /// On-disk state stored in `.neon/config`.
 ///
 /// This is the serialized counterpart of [`LocalEnv`]: `LocalEnv::load_config`
 /// parses the config file into this struct and then moves the fields into a
 /// `LocalEnv`, and `LocalEnv::persist_config` builds one of these from a
 /// `LocalEnv` before writing it out as TOML.
 ///
 /// Because of `#[serde(default, deny_unknown_fields)]`, keys missing from the
 /// file fall back to the struct's `Default`, while unrecognized keys are a
 /// deserialization error.
 #[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
 #[serde(default, deny_unknown_fields)]
 pub struct OnDiskConfig {
    // Path to the postgres distribution.
    pub pg_distrib_dir: PathBuf,
    // Path to the directory containing the neon binaries.
    pub neon_distrib_dir: PathBuf,
    // Default tenant ID used by `neon_local` when none is given explicitly.
    pub default_tenant_id: Option<TenantId>,
    // Private key used to issue auth tokens.
    pub private_key_path: PathBuf,
    pub broker: NeonBroker,
    // Configuration for the storage controller.
    pub storage_controller: NeonStorageControllerConf,
    // Legacy field: never written (`skip_serializing`), and if the key is present
    // in the file the custom deserializer rejects it with an error asking the user
    // to remove it. After a successful load this is therefore always empty; the
    // pageserver list is rebuilt from the per-pageserver `pageserver.toml` files.
    #[serde(
        skip_serializing,
        deserialize_with = "fail_if_pageservers_field_specified"
    )]
    pub pageservers: Vec<PageServerConf>,
    pub safekeepers: Vec<SafekeeperConf>,
    // Control plane upcall API for the pageservers, if any.
    pub control_plane_api: Option<Url>,
    // Control plane upcall API for the storage controller, if any.
    pub control_plane_compute_hook_api: Option<Url>,
    // Human-readable branch name -> (tenant, timeline) pairs.
    // Private: only code in this module reads or writes it.
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }
 /// Serde `deserialize_with` hook for [`OnDiskConfig::pageservers`].
 ///
 /// Never produces a value: it ignores the deserializer and unconditionally
 /// returns a custom error telling the user to remove the obsolete
 /// `pageservers` entry from `.neon/config`.
 fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
 where
    D: serde::Deserializer<'de>,
 {
    Err(serde::de::Error::custom(
        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
         Please remove the `pageservers` from your .neon/config.",
    ))
 }
 /// The description of the neon_local env to be initialized by `neon_local init --config`.
 ///
 /// Consumed by `LocalEnv::init`, which fills in defaults for the optional
 /// fields and materializes the result on disk. Unknown keys are rejected
 /// (`deny_unknown_fields`).
 #[derive(Clone, Debug, Deserialize)]
 #[serde(deny_unknown_fields)]
 pub struct NeonLocalInitConf {
    // TODO: do we need this? Seems unused
    // When `None`, `init` uses `POSTGRES_DISTRIB_DIR` if set, else `<cwd>/pg_install`.
    pub pg_distrib_dir: Option<PathBuf>,
    // TODO: do we need this? Seems unused
    // When `None`, `init` uses the directory containing the current executable.
    pub neon_distrib_dir: Option<PathBuf>,
    // Mandatory here; stored as `Some(..)` in the resulting `LocalEnv`.
    pub default_tenant_id: TenantId,
    pub broker: NeonBroker,
    // When `None`, `init` uses `NeonStorageControllerConf::default()`.
    pub storage_controller: Option<NeonStorageControllerConf>,
    // Per-pageserver init configuration; `init` creates a data dir and a
    // `pageserver.toml` for each entry.
    pub pageservers: Vec<NeonLocalInitPageserverConf>,
    pub safekeepers: Vec<SafekeeperConf>,
    // Nested `Option`s: `init` collapses them with `unwrap_or_default()`,
    // so an outer `None` ends up as "no URL".
    pub control_plane_api: Option<Option<Url>>,
    pub control_plane_compute_hook_api: Option<Option<Url>>,
 }
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -98,6 +145,33 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }
 /// Configuration for the storage controller (one per neon_local environment).
 ///
 /// With the container-level `#[serde(default)]`, keys missing from the config
 /// are taken from this type's `Default` impl.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
 pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    // (De)serialized through `humantime_serde` rather than serde's default
    // `Duration` representation.
    #[serde(with = "humantime_serde")]
    pub max_unavailable: Duration,
    /// Threshold for auto-splitting a tenant into shards
    // NOTE(review): unit of the threshold is not visible here; confirm
    // against the storage controller before relying on it.
    pub split_threshold: Option<u64>,
 }
 /// Associated constants for [`NeonStorageControllerConf`].
 impl NeonStorageControllerConf {
    // Use a shorter pageserver unavailability interval than the default to speed up tests.
    // This 10 second value is what `Default::default()` puts into `max_unavailable`.
    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
        std::time::Duration::from_secs(10);
 }
 /// Defaults used when the storage controller section (or one of its keys)
 /// is absent from the config.
 impl Default for NeonStorageControllerConf {
    /// Returns a config with `max_unavailable` set to
    /// `DEFAULT_MAX_UNAVAILABLE_INTERVAL` and no split threshold.
    fn default() -> Self {
        Self {
            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
            split_threshold: None,
        }
    }
 }
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
@@ -113,22 +187,18 @@ impl NeonBroker {
    }
 }
 // neon_local needs to know this subset of pageserver configuration.
 // For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
 // It can get stale if `pageserver.toml` is changed.
 // TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
    // node id
    pub id: NodeId,
    // Pageserver connection settings
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
 }
 impl Default for PageServerConf {
@@ -139,8 +209,40 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            virtual_file_io_engine: None,
+        }
-            get_vectored_impl: None,
+    }
 }
 /// The toml that can be passed to `neon_local init --config`.
 /// This is a subset of the `pageserver.toml` configuration.
 ///
 /// The five named fields are the ones neon_local itself needs; every other
 /// key supplied by the user is collected into `other`.
 // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
 #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
 pub struct NeonLocalInitPageserverConf {
    // Node id of the pageserver.
    pub id: NodeId,
    // Pageserver connection settings.
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    // Auth type used for the PG and HTTP ports.
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
    // Catch-all for any remaining top-level keys, kept as raw toml values.
    // `#[serde(flatten)]` makes them appear next to the named fields (not
    // under an `other` table) on both deserialization and serialization.
    #[serde(flatten)]
    pub other: HashMap<String, toml::Value>,
 }
 /// Derives the runtime [`PageServerConf`] from the init-time configuration.
 impl From<&NeonLocalInitPageserverConf> for PageServerConf {
    /// Copies the id, listen addresses and auth types from `conf`; the
    /// free-form `other` entries are intentionally dropped.
    fn from(conf: &NeonLocalInitPageserverConf) -> Self {
        // Destructure without `..` so that adding a field to
        // `NeonLocalInitPageserverConf` forces this conversion to be revisited.
        let NeonLocalInitPageserverConf {
            id,
            listen_pg_addr,
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
            other: _,
        } = conf;
        Self {
            id: *id,
            listen_pg_addr: listen_pg_addr.clone(),
            listen_http_addr: listen_http_addr.clone(),
            pg_auth_type: *pg_auth_type,
            http_auth_type: *http_auth_type,
        }
    }
 }
@@ -328,41 +430,7 @@ impl LocalEnv {
            .collect()
    }
-    /// Create a LocalEnv from a config file.
+    ///  Construct `Self` from on-disk state.
    ///
    /// Unlike 'load_config', this function fills in any defaults that are missing
    /// from the config file.
    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
        let mut env: LocalEnv = toml::from_str(toml)?;
        // Find postgres binaries.
        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
        // Note that later in the code we assume, that distrib dirs follow the same pattern
        // for all postgres versions.
        if env.pg_distrib_dir == Path::new("") {
            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
                env.pg_distrib_dir = postgres_bin.into();
            } else {
                let cwd = env::current_dir()?;
                env.pg_distrib_dir = cwd.join("pg_install")
            }
        }
        // Find neon binaries.
        if env.neon_distrib_dir == Path::new("") {
            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }
        if env.pageservers.is_empty() {
            anyhow::bail!("Configuration must contain at least one pageserver");
        }
        env.base_data_dir = base_path();
        Ok(env)
    }
    /// Locate and load config
    pub fn load_config() -> anyhow::Result<Self> {
        let repopath = base_path();
@@ -376,38 +444,129 @@ impl LocalEnv {
        // TODO: check that it looks like a neon repository
        // load and parse file
-        let config = fs::read_to_string(repopath.join("config"))?;
+        let config_file_contents = fs::read_to_string(repopath.join("config"))?;
-        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+        let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
        let mut env = {
            let OnDiskConfig {
                pg_distrib_dir,
                neon_distrib_dir,
                default_tenant_id,
                private_key_path,
                broker,
                storage_controller,
                pageservers,
                safekeepers,
                control_plane_api,
                control_plane_compute_hook_api,
                branch_name_mappings,
            } = on_disk_config;
            LocalEnv {
                base_data_dir: repopath.clone(),
                pg_distrib_dir,
                neon_distrib_dir,
                default_tenant_id,
                private_key_path,
                broker,
                storage_controller,
                pageservers,
                safekeepers,
                control_plane_api,
                control_plane_compute_hook_api,
                branch_name_mappings,
            }
        };
-        env.base_data_dir = repopath;
+        // The source of truth for pageserver configuration is the pageserver.toml.
        assert!(
            env.pageservers.is_empty(),
            "we ensure this during deserialization"
        );
        env.pageservers = {
            let iter = std::fs::read_dir(&repopath).context("open dir")?;
            let mut pageservers = Vec::new();
            for res in iter {
                let dentry = res?;
                const PREFIX: &str = "pageserver_";
                let dentry_name = dentry
                    .file_name()
                    .into_string()
                    .ok()
                    .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
                    .unwrap();
                if !dentry_name.starts_with(PREFIX) {
                    continue;
                }
                if !dentry.file_type().context("determine file type")?.is_dir() {
                    anyhow::bail!("expected a directory, got {:?}", dentry.path());
                }
                let id = dentry_name[PREFIX.len()..]
                    .parse::<NodeId>()
                    .with_context(|| format!("parse id from {:?}", dentry.path()))?;
                // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
                #[derive(serde::Serialize, serde::Deserialize)]
                // (allow unknown fields, unlike PageServerConf)
                struct PageserverConfigTomlSubset {
                    id: NodeId,
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    pg_auth_type: AuthType,
                    http_auth_type: AuthType,
                }
                let config_toml_path = dentry.path().join("pageserver.toml");
                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
                    &std::fs::read_to_string(&config_toml_path)
                        .with_context(|| format!("read {:?}", config_toml_path))?,
                )
                .context("parse pageserver.toml")?;
                let PageserverConfigTomlSubset {
                    id: config_toml_id,
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                } = config_toml;
                let conf = PageServerConf {
                    id: {
                        anyhow::ensure!(
                            config_toml_id == id,
                            "id mismatch: config_toml.id={config_toml_id} id={id}",
                        );
                        id
                    },
                    listen_pg_addr,
                    listen_http_addr,
                    pg_auth_type,
                    http_auth_type,
                };
                pageservers.push(conf);
            }
            pageservers
        };
        Ok(env)
    }
-    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
+    pub fn persist_config(&self) -> anyhow::Result<()> {
-        // Currently, the user first passes a config file with 'neon_local init --config=<path>'
+        Self::persist_config_impl(
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
+            &self.base_data_dir,
-        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
+            &OnDiskConfig {
-        // a bit sad.
+                pg_distrib_dir: self.pg_distrib_dir.clone(),
-        let mut conf_content = r#"# This file describes a local deployment of the page server
+                neon_distrib_dir: self.neon_distrib_dir.clone(),
-# and safekeeeper node. It is read by the 'neon_local' command-line
+                default_tenant_id: self.default_tenant_id,
-# utility.
+                private_key_path: self.private_key_path.clone(),
-"#
+                broker: self.broker.clone(),
-        .to_string();
+                storage_controller: self.storage_controller.clone(),
-
+                pageservers: vec![], // it's skip_serializing anyway
-        // Convert the LocalEnv to a toml file.
+                safekeepers: self.safekeepers.clone(),
-        //
+                control_plane_api: self.control_plane_api.clone(),
-        // This could be as simple as this:
+                control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
-        //
+                branch_name_mappings: self.branch_name_mappings.clone(),
-        // conf_content += &toml::to_string_pretty(env)?;
+            },
-        //
+        )
-        // But it results in a "values must be emitted before tables". I'm not sure
+    }
        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
        // Maybe rust reorders the fields to squeeze avoid padding or something?
        // In any case, converting to toml::Value first, and serializing that, works.
        // See https://github.com/alexcrichton/toml-rs/issues/142
        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
    pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
        let conf_content = &toml::to_string_pretty(config)?;
        let target_config_path = base_path.join("config");
        fs::write(&target_config_path, conf_content).with_context(|| {
            format!(
@@ -432,17 +591,13 @@ impl LocalEnv {
        }
    }
-    //
+    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
-    // Initialize a new Neon repository
+    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
-    //
+        let base_path = base_path();
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
+        assert_ne!(base_path, Path::new(""));
-        // check if config already exists
+        let base_path = &base_path;
        let base_path = &self.base_data_dir;
        ensure!(
            base_path != Path::new(""),
            "repository base path is missing"
        );
        // create base_path dir
        if base_path.exists() {
            match force {
                InitForceMode::MustNotExist => {
@@ -474,70 +629,96 @@ impl LocalEnv {
                }
            }
        }
        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
                self.pg_bin_dir(pg_version)?.display()
            );
        }
        for binary in ["pageserver", "safekeeper"] {
            if !self.neon_distrib_dir.join(binary).exists() {
                bail!(
                    "Can't find binary '{binary}' in neon distrib dir '{}'",
                    self.neon_distrib_dir.display()
                );
            }
        }
        if !base_path.exists() {
            fs::create_dir(base_path)?;
        }
        let NeonLocalInitConf {
            pg_distrib_dir,
            neon_distrib_dir,
            default_tenant_id,
            broker,
            storage_controller,
            pageservers,
            safekeepers,
            control_plane_api,
            control_plane_compute_hook_api,
        } = conf;
        // Find postgres binaries.
        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
        // Note that later in the code we assume, that distrib dirs follow the same pattern
        // for all postgres versions.
        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
                postgres_bin.into()
            } else {
                let cwd = env::current_dir().unwrap();
                cwd.join("pg_install")
            }
        });
        // Find neon binaries.
        let neon_distrib_dir = neon_distrib_dir
            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
        // Generate keypair for JWT.
        //
        // The keypair is only needed if authentication is enabled in any of the
        // components. For convenience, we generate the keypair even if authentication
        // is not enabled, so that you can easily enable it after the initialization
-        // step. However, if the key generation fails, we treat it as non-fatal if
+        // step.
-        // authentication was not enabled.
+        generate_auth_keys(
-        if self.private_key_path == PathBuf::new() {
+            base_path.join("auth_private_key.pem").as_path(),
-            match generate_auth_keys(
+            base_path.join("auth_public_key.pem").as_path(),
-                base_path.join("auth_private_key.pem").as_path(),
+        )
-                base_path.join("auth_public_key.pem").as_path(),
+        .context("generate auth keys")?;
-            ) {
+        let private_key_path = PathBuf::from("auth_private_key.pem");
-                Ok(()) => {
+
-                    self.private_key_path = PathBuf::from("auth_private_key.pem");
+        // create the runtime type because the remaining initialization code below needs
-                }
+        // a LocalEnv instance to operate on
-                Err(e) => {
+        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
-                    if !self.auth_keys_needed() {
+        let env = LocalEnv {
-                        eprintln!("Could not generate keypair for JWT authentication: {e}");
+            base_data_dir: base_path.clone(),
-                        eprintln!("Continuing anyway because authentication was not enabled");
+            pg_distrib_dir,
-                        self.private_key_path = PathBuf::from("auth_private_key.pem");
+            neon_distrib_dir,
-                    } else {
+            default_tenant_id: Some(default_tenant_id),
-                        return Err(e);
+            private_key_path,
-                    }
+            broker,
-                }
+            storage_controller: storage_controller.unwrap_or_default(),
-            }
+            pageservers: pageservers.iter().map(Into::into).collect(),
            safekeepers,
            control_plane_api: control_plane_api.unwrap_or_default(),
            control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
            branch_name_mappings: Default::default(),
        };
        // create endpoints dir
        fs::create_dir_all(env.endpoints_path())?;
        // create safekeeper dirs
        for safekeeper in &env.safekeepers {
            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
        }
-        fs::create_dir_all(self.endpoints_path())?;
+        // initialize pageserver state
-
+        for (i, ps) in pageservers.into_iter().enumerate() {
-        for safekeeper in &self.safekeepers {
+            let runtime_ps = &env.pageservers[i];
-            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
+            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
            fs::create_dir(env.pageserver_data_dir(ps.id))?;
            PageServerNode::from_env(&env, runtime_ps)
                .initialize(ps)
                .context("pageserver init failed")?;
        }
-        self.persist_config(base_path)
+        // set up the remote location for the default LocalFs remote storage
-    }
+        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-    fn auth_keys_needed(&self) -> bool {
+        env.persist_config()
        self.pageservers.iter().any(|ps| {
            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }
-fn base_path() -> PathBuf {
+pub fn base_path() -> PathBuf {
    match std::env::var_os("NEON_REPO_DIR") {
        Some(val) => PathBuf::from(val),
        None => PathBuf::from(".neon"),
@@ -580,31 +761,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
    }
    Ok(())
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn simple_conf_parsing() {
        let simple_conf_toml = include_str!("../simple.conf");
        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
        assert!(
            simple_conf_parse_result.is_ok(),
            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
        );
        let string_to_replace = "listen_addr = '127.0.0.1:50051'";
        let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
        assert!(
            spoiled_url_toml.contains(spoiled_url_str),
            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
        );
        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
        assert!(
            spoiled_url_parse_result.is_err(),
            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
        );
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -4,21 +4,21 @@
 //!
 //!   .neon/
 //!
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::Command;
+use std::str::FromStr;
 use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
 use pageserver_api::models::{
-    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
+    self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
    TimelineInfo,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -30,7 +30,7 @@ use utils::{
    lsn::Lsn,
 };
-use crate::local_env::PageServerConf;
+use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
 use crate::{background_process, local_env::LocalEnv};
 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,57 +74,23 @@ impl PageServerNode {
        }
    }
-    /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
+    fn pageserver_init_make_toml(
-    ///
+        &self,
-    /// These all end up on the command line of the `pageserver` binary.
+        conf: NeonLocalInitPageserverConf,
-    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
+    ) -> anyhow::Result<toml_edit::Document> {
        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );
        let PageServerConf {
            id,
            listen_pg_addr,
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
        } = &self.conf;
        let id = format!("id={}", id);
        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
        } else {
            String::new()
        };
        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
            format!("get_vectored_impl='{get_vectored_impl}'")
        } else {
            String::new()
        };
        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
-        let mut overrides = vec![
+        let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
            id,
            pg_distrib_dir_param,
            http_auth_type_param,
            pg_auth_type_param,
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
        ];
        if let Some(control_plane_api) = &self.env.control_plane_api {
            overrides.push(format!(
@@ -134,7 +100,7 @@ impl PageServerNode {
            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
-            if matches!(http_auth_type, AuthType::NeonJWT) {
+            if matches!(conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -143,31 +109,40 @@ impl PageServerNode {
            }
        }
-        if !cli_overrides
+        if !conf.other.contains_key("remote_storage") {
            .iter()
            .any(|c| c.starts_with("remote_storage"))
        {
            overrides.push(format!(
                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
            ));
        }
-        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
+        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
        // Apply the user-provided overrides
-        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+        overrides.push(
            toml_edit::ser::to_string_pretty(&conf)
                .expect("we deserialized this from toml earlier"),
        );
-        overrides
+        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
        let mut config_toml = toml_edit::Document::new();
        for fragment_str in overrides {
            let fragment = toml_edit::Document::from_str(&fragment_str)
                .expect("all fragments in `overrides` are valid toml documents, this function controls that");
            for (key, item) in fragment.iter() {
                config_toml.insert(key, item.clone());
            }
        }
        Ok(config_toml)
    }
    /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
-        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
+        self.pageserver_init(conf)
        self.pageserver_init(config_overrides)
            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
    }
@@ -183,11 +158,11 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }
-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    pub async fn start(&self) -> anyhow::Result<()> {
-        self.start_node(config_overrides, false).await
+        self.start_node().await
    }
-    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
        let datadir = self.repo_path();
        let node_id = self.conf.id;
        println!(
@@ -198,29 +173,20 @@ impl PageServerNode {
        );
        io::stdout().flush()?;
-        if !datadir.exists() {
+        let config = self
-            std::fs::create_dir(&datadir)?;
+            .pageserver_init_make_toml(conf)
-        }
+            .context("make pageserver toml")?;
-
+        let config_file_path = datadir.join("pageserver.toml");
-        let datadir_path_str = datadir.to_str().with_context(|| {
+        let mut config_file = std::fs::OpenOptions::new()
-            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
+            .create_new(true)
-        })?;
+            .write(true)
-        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+            .open(&config_file_path)
-        args.push(Cow::Borrowed("--init"));
+            .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
-
+        config_file
-        let init_output = Command::new(self.env.pageserver_bin())
+            .write_all(config.to_string().as_bytes())
-            .args(args.iter().map(Cow::as_ref))
+            .context("write pageserver toml")?;
-            .envs(self.pageserver_env_variables()?)
+        drop(config_file);
-            .output()
+        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
            .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
        anyhow::ensure!(
            init_output.status.success(),
            "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
            node_id,
            String::from_utf8_lossy(&init_output.stdout),
            String::from_utf8_lossy(&init_output.stderr),
        );
        // Write metadata file, used by pageserver on startup to register itself with
        // the storage controller
@@ -234,12 +200,13 @@ impl PageServerNode {
        // situation: the metadata is written by some other script.
        std::fs::write(
            metadata_path,
-            serde_json::to_vec(&serde_json::json!({
+            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
-                "host": "localhost",
+                postgres_host: "localhost".to_string(),
-                "port": self.pg_connection_config.port(),
+                postgres_port: self.pg_connection_config.port(),
-                "http_host": "localhost",
+                http_host: "localhost".to_string(),
-                "http_port": http_port,
+                http_port,
-            }))
+                other: HashMap::new(),
            })
            .unwrap(),
        )
        .expect("Failed to write metadata file");
@@ -247,11 +214,7 @@ impl PageServerNode {
        Ok(())
    }
-    async fn start_node(
+    async fn start_node(&self) -> anyhow::Result<()> {
        &self,
        config_overrides: &[&str],
        update_config: bool,
    ) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -268,15 +231,12 @@ impl PageServerNode {
                self.conf.id, datadir,
            )
        })?;
-        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        let args = vec!["-D", datadir_path_str];
        if update_config {
            args.push(Cow::Borrowed("--update-config"));
        }
        background_process::start_process(
            "pageserver",
            &datadir,
            &self.env.pageserver_bin(),
-            args.iter().map(Cow::as_ref),
+            args,
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
@@ -293,22 +253,6 @@ impl PageServerNode {
        Ok(())
    }
    /// Assemble the base pageserver command line: `-D <datadir_path_str>` followed by one
    /// `-c <override>` pair for every entry produced by `neon_local_overrides`.
    fn pageserver_basic_args<'a>(
        &self,
        config_overrides: &'a [&'a str],
        datadir_path_str: &'a str,
    ) -> Vec<Cow<'a, str>> {
        let mut args = Vec::from([Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]);
        // Each override contributes two arguments: the `-c` flag and the owned override text.
        args.extend(
            self.neon_local_overrides(config_overrides)
                .into_iter()
                .flat_map(|value| [Cow::Borrowed("-c"), Cow::Owned(value)]),
        );
        args
    }
    fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
@@ -434,6 +378,11 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("parse `timeline_get_throttle` from json")?,
            switch_aux_file_policy: settings
                .remove("switch_aux_file_policy")
                .map(|x| x.parse::<AuxFilePolicy>())
                .transpose()
                .context("Failed to parse 'switch_aux_file_policy'")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -552,6 +501,11 @@ impl PageServerNode {
                    .map(serde_json::from_str)
                    .transpose()
                    .context("parse `timeline_get_throttle` from json")?,
                switch_aux_file_policy: settings
                    .remove("switch_aux_file_policy")
                    .map(|x| x.parse::<AuxFilePolicy>())
                    .transpose()
                    .context("Failed to parse 'switch_aux_file_policy'")?,
            }
        };
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,6 +1,8 @@
-use crate::{background_process, local_env::LocalEnv};
+use crate::{
    background_process,
    local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper::Method;
 use pageserver_api::{
    controller_api::{
        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
@@ -14,6 +16,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{fs, str::FromStr};
 use tokio::process::Command;
@@ -32,15 +35,13 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
    config: NeonStorageControllerConf,
 }
 const COMMAND: &str = "storage_controller";
 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 // Use a shorter pageserver unavailability interval than the default to speed up tests.
 const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -135,6 +136,7 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
            config: env.storage_controller.clone(),
        }
    }
@@ -241,9 +243,13 @@ impl StorageController {
                anyhow::bail!("initdb failed with status {status}");
            }
            // Write a minimal config file:
            // - Specify the port, since this is chosen dynamically
            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
            //   the storage controller we don't want a slow local disk to interfere with that.
            tokio::fs::write(
                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}", self.postgres_port),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
            )
            .await?;
        };
@@ -272,8 +278,6 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;
        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +287,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &max_unavailable.to_string(),
+            &humantime::Duration::from(self.config.max_unavailable).to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -305,6 +309,10 @@ impl StorageController {
            ));
        }
        if let Some(split_threshold) = self.config.split_threshold.as_ref() {
            args.push(format!("--split-threshold={split_threshold}"))
        }
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
@@ -379,7 +387,7 @@ impl StorageController {
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
-        method: hyper::Method,
+        method: reqwest::Method,
        path: String,
        body: Option<RQ>,
    ) -> anyhow::Result<RS>
@@ -472,6 +480,16 @@ impl StorageController {
            .await
    }
    /// Issue a bodiless `POST` to `debug/v1/tenant/{tenant_id}/import` through `dispatch`
    /// and return whatever `dispatch` yields as a `TenantCreateResponse`.
    #[instrument(skip(self))]
    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
        let import_path = format!("debug/v1/tenant/{tenant_id}/import");
        let no_body: Option<()> = None;
        self.dispatch::<(), TenantCreateResponse>(Method::POST, import_path, no_body)
            .await
    }
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,7 +1,6 @@
 use std::{collections::HashMap, str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use hyper::{Method, StatusCode};
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
@@ -14,7 +13,7 @@ use pageserver_api::{
    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
-use reqwest::Url;
+use reqwest::{Method, StatusCode, Url};
 use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};
@@ -232,7 +231,7 @@ impl Client {
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
-        method: hyper::Method,
+        method: Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,8 +8,6 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
 # XXX: Currently does not work on M1 Macs because only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
 set -eux -o pipefail
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.
-The file `[persistence.rs](http://persistence.rs)` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
 The `diesel` crate is used for defining models & migrations.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,7 +3,7 @@
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};
-use crate::spec::ComputeSpec;
+use crate::spec::{ComputeSpec, Database, Role};
 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
    pub total_ext_download_size: u64,
 }
 /// A serializable bundle of roles and databases.
 ///
 /// NOTE(review): presumably describes the objects found in a compute's Postgres catalog --
 /// confirm against the code that populates it.
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct CatalogObjects {
    /// Roles included in this listing.
    pub roles: Vec<Role>,
    /// Databases included in this listing.
    pub databases: Vec<Database>,
 }
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 /// This is not actually a compute API response, so consider moving
 /// to a different place.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -33,6 +33,23 @@ pub struct ComputeSpec {
    #[serde(default)]
    pub features: Vec<ComputeFeature>,
    /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
    /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
    /// received.
    ///
    /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
    /// spec generation doesn't need to be aware of the actual compute it's running on, while
    /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
    /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
    /// giving every VM much more swap than it should have (32GiB).
    ///
    /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
    /// enabling the swap resizing behavior once rollout is complete.
    ///
    /// See neondatabase/cloud#12047 for more.
    #[serde(default)]
    pub swap_size_bytes: Option<u64>,
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -256,7 +256,16 @@ fn update_rusage_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
+
    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
    #[cfg(target_os = "macos")]
    {
        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
    }
    #[cfg(not(target_os = "macos"))]
    {
        MAXRSS_KB.set(rusage_stats.ru_maxrss);
    }
 }
 fn get_rusage_stats() -> libc::rusage {
@@ -471,6 +480,15 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
        let id = self.vec.with_labels(labels);
        self.vec.remove_metric(id)
    }
    /// Read the current value of the counter pair selected by `labels`: the increment
    /// count minus the decrement count, clamped at zero if decrements are ahead.
    pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
        use std::sync::atomic::Ordering::Relaxed;

        let key = self.vec.with_labels(labels);
        let pair = self.vec.get_metric(key);
        let increments = pair.inc.count.load(Relaxed);
        let decrements = pair.dec.count.load(Relaxed);
        increments.saturating_sub(decrements)
    }
 }
 impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -0,0 +1,31 @@
 use std::collections::HashMap;
 use const_format::formatcp;
 #[cfg(test)]
 mod tests;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not needed by the pageserver
 // itself; it is only used for registering the pageserver with the control
 // plane and/or storage controller.
 //
 #[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
 pub struct NodeMetadata {
    // Serialized under the key `host`.
    #[serde(rename = "host")]
    pub postgres_host: String,
    // Serialized under the key `port`.
    #[serde(rename = "port")]
    pub postgres_port: u16,
    pub http_host: String,
    pub http_port: u16,
    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields that we
    // require. Every other key is collected here via `#[serde(flatten)]`.
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
 }
--- a/libs/pageserver_api/src/config/tests.rs
+++ b/libs/pageserver_api/src/config/tests.rs
@@ -0,0 +1,22 @@
 use super::*;
 #[test]
 fn test_node_metadata_v1_backward_compatibilty() {
    // JSON written with the plain `host` / `port` key names must still deserialize into
    // the renamed `postgres_host` / `postgres_port` fields, with nothing left in `other`.
    let v1_json = serde_json::json!({
        "host": "localhost",
        "port": 23,
        "http_host": "localhost",
        "http_port": 42,
    });
    let v1_bytes = serde_json::to_vec(&v1_json).unwrap();
    let parsed: NodeMetadata = serde_json::from_slice(&v1_bytes).unwrap();

    let expected = NodeMetadata {
        postgres_host: "localhost".to_string(),
        postgres_port: 23,
        http_host: "localhost".to_string(),
        http_port: 42,
        other: HashMap::new(),
    };
    assert_eq!(parsed, expected)
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,5 +1,6 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
@@ -21,15 +22,107 @@ pub struct Key {
    pub field6: u32,
 }
 /// The storage key size.
 pub const KEY_SIZE: usize = 18;
 /// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
 /// See [`Key::to_i128`] for more information on the encoding.
 pub const METADATA_KEY_SIZE: usize = 16;
 /// The start of the key prefix range for metadata keys. Every key whose first byte is >= 0x60 and < 0x7F is a metadata key.
 pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
 pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
 /// The (reserved) key prefix of relation sizes.
 pub const RELATION_SIZE_PREFIX: u8 = 0x61;
 /// The key prefix of AUX file keys.
 pub const AUX_KEY_PREFIX: u8 = 0x62;
 /// Check if the key falls in the range of metadata keys.
 ///
 /// Only the first byte is inspected: it must lie in the half-open range
 /// `METADATA_KEY_BEGIN_PREFIX..METADATA_KEY_END_PREFIX`. An empty slice has no prefix
 /// byte, so it is reported as "not a metadata key" rather than panicking on the index.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    if key.is_empty() {
        return false;
    }
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
 }
 impl Key {
    /// Report whether this key's `field1` prefix lies in the half-open metadata range
    /// `METADATA_KEY_BEGIN_PREFIX..METADATA_KEY_END_PREFIX`.
    pub const fn is_metadata_key(&self) -> bool {
        let prefix = self.field1;
        METADATA_KEY_BEGIN_PREFIX <= prefix && prefix < METADATA_KEY_END_PREFIX
    }
    /// Build a storage key from a fixed-size metadata key.
    ///
    /// Bytes are consumed big-endian in field order: byte 0 is `field1`, bytes 1..3 are
    /// `field2` (widened to `u32`), bytes 3..7 are `field3`, bytes 7..11 are `field4`,
    /// byte 11 is `field5` and bytes 12..16 are `field6`.
    ///
    /// Panics if the first byte is outside the metadata key prefix range.
    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
        assert!(is_metadata_key_slice(key), "key not in metadata key range");
        let field2 = u16::from_be_bytes([key[1], key[2]]);
        let field3 = u32::from_be_bytes([key[3], key[4], key[5], key[6]]);
        let field4 = u32::from_be_bytes([key[7], key[8], key[9], key[10]]);
        let field6 = u32::from_be_bytes([key[12], key[13], key[14], key[15]]);
        Key {
            field1: key[0],
            field2: u32::from(field2),
            field3,
            field4,
            field5: key[11],
            field6,
        }
    }
    /// Encode a metadata key to a storage key.
    pub fn from_metadata_key(key: &[u8]) -> Self {
        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
    }
    /// Extract a metadata key to a writer. The result should always be 16 bytes.
    ///
    /// Fields are appended in order: `field1` (1 byte), `field2` (2 bytes), `field3`
    /// (4 bytes), `field4` (4 bytes), `field5` (1 byte) and `field6` (4 bytes).
    ///
    /// Panics if `field2` does not fit in 16 bits. Note that `field1` has already been
    /// written to `writer` by the time that check runs.
    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
        writer.put_u8(self.field1);
        // Only the low 16 bits of field2 are stored in the metadata layout, so refuse to
        // silently truncate a larger value.
        assert!(self.field2 <= 0xFFFF);
        writer.put_u16(self.field2 as u16);
        writer.put_u32(self.field3);
        writer.put_u32(self.field4);
        writer.put_u8(self.field5);
        writer.put_u32(self.field6);
    }
    /// Get the range of metadata keys.
    ///
    /// The range runs from the all-zero key with prefix `METADATA_KEY_BEGIN_PREFIX`
    /// (inclusive) to the all-zero key with prefix `METADATA_KEY_END_PREFIX` (exclusive).
    pub const fn metadata_key_range() -> Range<Self> {
        let start = Key {
            field1: METADATA_KEY_BEGIN_PREFIX,
            field2: 0,
            field3: 0,
            field4: 0,
            field5: 0,
            field6: 0,
        };
        let end = Key {
            field1: METADATA_KEY_END_PREFIX,
            field2: 0,
            field3: 0,
            field4: 0,
            field5: 0,
            field6: 0,
        };
        Range { start, end }
    }
    /// Get the range of aux keys.
    ///
    /// Covers every key whose `field1` equals `AUX_KEY_PREFIX`: from the all-zero key with
    /// that prefix (inclusive) to the all-zero key with the next prefix (exclusive).
    pub fn metadata_aux_key_range() -> Range<Self> {
        let start = Key {
            field1: AUX_KEY_PREFIX,
            field2: 0,
            field3: 0,
            field4: 0,
            field5: 0,
            field6: 0,
        };
        let end = Key {
            field1: AUX_KEY_PREFIX + 1,
            field2: 0,
            field3: 0,
            field4: 0,
            field5: 0,
            field6: 0,
        };
        Range { start, end }
    }
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0xf) as i128) << 120)
+        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
            | ((self.field4 as i128) << 40)
@@ -39,7 +132,7 @@ impl Key {
    pub const fn from_i128(x: i128) -> Self {
        Key {
-            field1: ((x >> 120) & 0xf) as u8,
+            field1: ((x >> 120) & 0x7F) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
            field3: (x >> 72) as u32,
            field4: (x >> 40) as u32,
@@ -48,11 +141,11 @@ impl Key {
        }
    }
-    pub fn next(&self) -> Key {
+    pub const fn next(&self) -> Key {
        self.add(1)
    }
-    pub fn add(&self, x: u32) -> Key {
+    pub const fn add(&self, x: u32) -> Key {
        let mut key = *self;
        let r = key.field6.overflowing_add(x);
@@ -81,6 +174,8 @@ impl Key {
        key
    }
    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -92,6 +187,8 @@ impl Key {
        }
    }
    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -475,12 +572,17 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 /// Non inherited range for vectored get.
 pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
 pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
+    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
 }
 #[inline(always)]
@@ -556,11 +658,14 @@ impl std::str::FromStr for Key {
 mod tests {
    use std::str::FromStr;
    use crate::key::is_metadata_key_slice;
    use crate::key::Key;
    use rand::Rng;
    use rand::SeedableRng;
    use super::AUX_KEY_PREFIX;
    #[test]
    fn display_fromstr_bijection() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -576,4 +681,16 @@ mod tests {
        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }
    #[test]
    fn test_metadata_keys() {
        // A 16-byte metadata key: the aux prefix followed by fifteen 0xFF bytes.
        let metadata_key: Vec<u8> = std::iter::once(AUX_KEY_PREFIX)
            .chain(std::iter::repeat(0xFF).take(15))
            .collect();

        // Encoding to a storage key and extracting again must reproduce the input bytes.
        let encoded_key = Key::from_metadata_key(&metadata_key);
        let mut round_tripped = Vec::new();
        encoded_key.extract_metadata_key_to_writer(&mut round_tripped);
        assert_eq!(metadata_key, round_tripped);

        // Both the key-based and the slice-based predicates must recognise it.
        assert!(encoded_key.is_metadata_key());
        assert!(is_metadata_key_slice(&metadata_key));
    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,7 +1,10 @@
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;
-use crate::key::Key;
+use crate::{
    key::Key,
    shard::{ShardCount, ShardIdentity},
 };
 use itertools::Itertools;
 ///
@@ -14,44 +17,279 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }
-impl KeySpace {
+/// A wrapper type for sparse keyspaces.
 #[derive(Clone, Debug, Default, PartialEq, Eq)]
 pub struct SparseKeySpace(pub KeySpace);
 /// Represents a contiguous half-open range of the keyspace, masked according to a particular
 /// ShardNumber's stripes: within this range of keys, only some "belong" to the current
 /// shard.
 ///
 /// When we iterate over keys within this object, we will skip any keys that don't belong
 /// to this shard.
 ///
 /// The start + end keys may not belong to the shard: these specify where layer files should
 /// start  + end, but we will never actually read/write those keys.
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct ShardedRange<'a> {
    /// Shard layout consulted (via `is_key_disposable`, `count` and `stripe_size`) to decide
    /// which keys in `range` are stored on this shard.
    pub shard_identity: &'a ShardIdentity,
    /// The half-open key range being described.
    pub range: Range<Key>,
 }
 // Calculate the size of a range within the blocks of the same relation, or spanning only the
 // top page in the previous relation's space.
 //
 // The range must satisfy `is_contiguous_range` (checked in debug builds only).
 //
 // Two shapes are possible:
 // - start and end share field5: the length is a plain difference of field6.  This includes
 //   the empty range that starts at field6 == 0xffffffff, which must yield 0 rather than
 //   evaluating `0xffffffff + 1`.
 // - start is the 0xffffffff key of the previous field5: the length is that one key plus the
 //   blocks `0..end.field6` of the next one.  The sum saturates at u32::MAX, which callers
 //   already treat as "large", instead of overflowing.
 fn contiguous_range_len(range: &Range<Key>) -> u32 {
    debug_assert!(is_contiguous_range(range));
    if range.start.field6 == 0xffffffff && range.start.field5 != range.end.field5 {
        range.end.field6.saturating_add(1)
    } else {
        range.end.field6 - range.start.field6
    }
 }
 /// Decide whether a key range stays inside one relation's data blocks, or spans exactly one
 /// relation plus the logical size (0xffffffff) block of the relation before it.
 ///
 /// "Contiguous" here means the keys are known to be in use _somewhere_, though not
 /// necessarily on our shard; `ShardedRange` does the extra work of figuring out how much of
 /// such a range is present on one shard.
 ///
 /// Why it matters:
 /// - Inside such ranges keys are used contiguously; outside them the keyspace is sparse.
 /// - Inside such ranges distances can be computed by simple subtraction of field6.
 fn is_contiguous_range(range: &Range<Key>) -> bool {
    let (start, end) = (&range.start, &range.end);

    // field1..field4 must match exactly in every accepted shape.
    let same_leading_fields = start.field1 == end.field1
        && start.field2 == end.field2
        && start.field3 == end.field3
        && start.field4 == end.field4;
    if !same_leading_fields {
        return false;
    }

    if start.field5 == end.field5 {
        return true;
    }

    // Otherwise the only accepted shape starts on the 0xffffffff block of the
    // immediately preceding field5.
    start.field6 == 0xffffffff && start.field5 + 1 == end.field5
 }
 impl<'a> ShardedRange<'a> {
    pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
        Self {
            shard_identity,
            range,
        }
    }
    /// Break up this range into chunks, each of which has at least one local key in it if the
    /// total range has at least one local key.
    ///
    /// Returns `(local_block_count, key_range)` pairs in key order.  The key ranges are
    /// adjacent and together cover `self.range`.  `local_block_count` is the number of blocks
    /// in that fragment that belong to this shard; it is `u32::MAX` for a range that is not
    /// contiguous, which is returned as a single unfragmented entry.
    ///
    /// `target_nblocks` is the desired number of local blocks per fragment.  A value of 0 is
    /// treated as 1: with a zero target every local step would advance by zero keys and the
    /// loop below would never terminate.
    pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
        // Optimization for single-key case (e.g. logical size keys)
        if self.range.end == self.range.start.add(1) {
            return vec![(
                if self.shard_identity.is_key_disposable(&self.range.start) {
                    0
                } else {
                    1
                },
                self.range,
            )];
        }

        if !is_contiguous_range(&self.range) {
            // Ranges that span relations are not fragmented.  We only get these ranges as a result
            // of operations that act on existing layers, so we trust that the existing range is
            // reasonably small.
            return vec![(u32::MAX, self.range)];
        }

        // Guarantee forward progress: every non-disposable step must consume at least one block.
        let target_nblocks = std::cmp::max(target_nblocks, 1);

        let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
        let mut cursor = self.range.start;
        while cursor < self.range.end {
            let advance_by = self.distance_to_next_boundary(cursor);
            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);

            // If the previous fragment is undersized, then we seek to consume enough
            // blocks to complete it.
            let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
                Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
                Some(frag) => {
                    // Prev block is complete, want the full number.
                    (
                        target_nblocks,
                        if is_fragment_disposable {
                            // If this current range will be empty (not shard-local data), we will merge into previous
                            Some(frag)
                        } else {
                            None
                        },
                    )
                }
                None => {
                    // First iteration, want the full number
                    (target_nblocks, None)
                }
            };

            // Disposable stretches hold no local data, so they are skipped in one step;
            // local stretches are capped at what the current fragment still needs.
            let advance_by = if is_fragment_disposable {
                advance_by
            } else {
                std::cmp::min(advance_by, want_blocks)
            };

            let next_cursor = cursor.add(advance_by);

            let this_frag = (
                if is_fragment_disposable {
                    0
                } else {
                    advance_by
                },
                cursor..next_cursor,
            );
            cursor = next_cursor;

            if let Some(last_fragment) = merge_last_fragment {
                // Previous fragment was short or this one is empty, merge into it
                last_fragment.0 += this_frag.0;
                last_fragment.1.end = this_frag.1.end;
            } else {
                fragments.push(this_frag);
            }
        }

        fragments
    }
    /// Estimate how many physical pages inside this range live on this shard.
    ///
    /// A return value of `u32::MAX` means the range spans relations (or the count does not
    /// fit in a `u32`) and should be read as "large".
    pub fn page_count(&self) -> u32 {
        // A range holding exactly one key (e.g. a logical size key) is either ours or not.
        if self.range.end == self.range.start.add(1) {
            let is_local = !self.shard_identity.is_key_disposable(&self.range.start);
            return u32::from(is_local);
        }

        // An exact count is only possible for contiguous key ranges.
        if !is_contiguous_range(&self.range) {
            return u32::MAX;
        }

        // With fewer than two shards the logical and physical sizes coincide.
        if self.shard_identity.count < ShardCount::new(2) {
            return contiguous_range_len(&self.range);
        }

        // General case: walk the range one stripe (or partial stripe) at a time and add up
        // the blocks of every stretch that belongs to this shard.
        let mut local_pages: u64 = 0;
        let mut position = self.range.start;
        while position < self.range.end {
            // Distance to the next stripe boundary, or to the end of the range.
            let step = self.distance_to_next_boundary(position);
            if !self.shard_identity.is_key_disposable(&position) {
                local_pages += u64::from(step);
            }
            position = position.add(step);
        }

        // Clamp to u32::MAX when the total does not fit.
        u32::try_from(local_pages).unwrap_or(u32::MAX)
    }
    /// Compute how far `cursor` may advance before hitting the next potential fragment
    /// boundary: either a stripe boundary or the end of the range, whichever comes first.
    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
        let remaining_in_range = contiguous_range_len(&(cursor..self.range.end));

        // With fewer than two shards there are no stripes to step through.
        let is_unsharded = self.shard_identity.count < ShardCount::new(2);
        if is_unsharded {
            return remaining_in_range;
        }

        // Stepping off a relation's logical size key onto the next relation's first data
        // block is always a single-key move.
        if cursor.field6 == 0xffffffff {
            return 1;
        }

        let stripe_size = self.shard_identity.stripe_size.0;
        let stripe_start = (cursor.field6 / stripe_size) * stripe_size;
        let stripe_remainder = stripe_size - (cursor.field6 - stripe_start);

        if cfg!(debug_assertions) {
            // Advancing to the stripe boundary must never carry out of field6: callers have
            // already returned their u32::MAX results for ranges where that could happen.
            let next_cursor = cursor.add(stripe_remainder);
            debug_assert!(
                next_cursor.field1 == cursor.field1
                    && next_cursor.field2 == cursor.field2
                    && next_cursor.field3 == cursor.field3
                    && next_cursor.field4 == cursor.field4
                    && next_cursor.field5 == cursor.field5
            )
        }

        stripe_remainder.min(remaining_in_range)
    }
    /// Count the pages spanned by `range` without any shard masking.
    ///
    /// Unlike `page_count`, which estimates the pages physically present on this shard, this
    /// ignores whether a page would actually be stored on this node.  Returns `u32::MAX` for
    /// a range that is not contiguous.
    ///
    /// Don't use this function in code that works with physical entities like layer files.
    pub fn raw_size(range: &Range<Key>) -> u32 {
        if !is_contiguous_range(range) {
            return u32::MAX;
        }
        contiguous_range_len(range)
    }
 }
 impl KeySpace {
    /// Create a key space with a single range.
    pub fn single(key_range: Range<Key>) -> Self {
        Self {
            ranges: vec![key_range],
        }
    }
    /// Partition a key space into chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
-    pub fn partition(&self, target_size: u64) -> KeyPartitioning {
+    pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / BLCKSZ as u64) as usize;
+        let target_nblocks = (target_size / BLCKSZ as u64) as u32;
        let mut parts = Vec::new();
        let mut current_part = Vec::new();
        let mut current_part_size: usize = 0;
        for range in &self.ranges {
-            // If appending the next contiguous range in the keyspace to the current
+            // While doing partitioning, wrap the range in ShardedRange so that our size calculations
-            // partition would cause it to be too large, start a new partition.
+            // will respect shard striping rather than assuming all keys within a range are present.
-            let this_size = key_range_size(range) as usize;
+            let range = ShardedRange::new(range.clone(), shard_identity);
            if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
                parts.push(KeySpace {
                    ranges: current_part,
                });
                current_part = Vec::new();
                current_part_size = 0;
            }
-            // If the next range is larger than 'target_size', split it into
+            // Chunk up the range into parts that each contain up to target_size local blocks
-            // 'target_size' chunks.
+            for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
-            let mut remain_size = this_size;
+                // If appending the next contiguous range in the keyspace to the current
-            let mut start = range.start;
+                // partition would cause it to be too large, and our current partition
-            while remain_size > target_nblocks {
+                // covers at least one block that is physically present in this shard,
-                let next = start.add(target_nblocks as u32);
+                // then start a new partition
-                parts.push(KeySpace {
+                if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
-                    ranges: vec![start..next],
+                    && current_part_size > 0
-                });
+                {
-                start = next;
+                    parts.push(KeySpace {
-                remain_size -= target_nblocks
+                        ranges: current_part,
                    });
                    current_part = Vec::new();
                    current_part_size = 0;
                }
                current_part.push(frag_range.start..frag_range.end);
                current_part_size += frag_on_shard_size as usize;
            }
            current_part.push(start..range.end);
            current_part_size += remain_size;
        }
        // add last partition that wasn't full yet.
@@ -64,8 +302,12 @@ impl KeySpace {
        KeyPartitioning { parts }
    }
    /// Returns `true` when `total_raw_size()` reports zero for this keyspace.
    pub fn is_empty(&self) -> bool {
        self.total_raw_size() == 0
    }
    /// Merge another keyspace into the current one.
-    /// Note: the keyspaces must not ovelap (enforced via assertions)
+    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
    pub fn merge(&mut self, other: &KeySpace) {
        let all_ranges = self
            .ranges
@@ -94,12 +336,13 @@ impl KeySpace {
    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+    /// Returns the removed keyspace
    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return;
+                return KeySpace::default();
            }
        };
@@ -112,30 +355,37 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);
        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();
                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
        removed_accum.to_keyspace()
    }
    pub fn start(&self) -> Option<Key> {
@@ -146,11 +396,11 @@ impl KeySpace {
        self.ranges.last().map(|range| range.end)
    }
-    #[allow(unused)]
+    /// The size of the keyspace in pages, before accounting for sharding
-    pub fn total_size(&self) -> usize {
+    pub fn total_raw_size(&self) -> usize {
        self.ranges
            .iter()
-            .map(|range| key_range_size(range) as usize)
+            .map(|range| ShardedRange::raw_size(range) as usize)
            .sum()
    }
@@ -170,6 +420,11 @@ impl KeySpace {
    /// Returns `true` if `overlaps_at` finds an overlap for `range`.
    pub fn overlaps(&self, range: &Range<Key>) -> bool {
        self.overlaps_at(range).is_some()
    }
    /// Check if the keyspace contains a key
    pub fn contains(&self, key: &Key) -> bool {
        self.overlaps(&(*key..key.next()))
    }
 }
 ///
@@ -184,10 +439,33 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }
 /// Represents a partitioning of the sparse key space.
 #[derive(Clone, Debug, Default)]
 pub struct SparseKeyPartitioning {
    /// The partitions, each one a `SparseKeySpace`.
    pub parts: Vec<SparseKeySpace>,
 }
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
    /// Convert a key partitioning to a sparse partition.
    pub fn into_sparse(self) -> SparseKeyPartitioning {
        SparseKeyPartitioning {
            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
        }
    }
 }
 impl SparseKeyPartitioning {
    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
    /// cause long/dead loops.
    pub fn into_dense(self) -> KeyPartitioning {
        KeyPartitioning {
            parts: self.parts.into_iter().map(|x| x.0).collect(),
        }
    }
 }
 ///
@@ -219,7 +497,7 @@ impl KeySpaceAccum {
    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
-        self.size += key_range_size(&range) as u64;
+        self.size += ShardedRange::raw_size(&range) as u64;
        match self.accum.as_mut() {
            Some(accum) => {
@@ -251,7 +529,9 @@ impl KeySpaceAccum {
        std::mem::take(self).to_keyspace()
    }
-    pub fn size(&self) -> u64 {
+    // The total number of keys in this object, ignoring any sharding effects that might cause some of
    // the keys to be omitted in storage on this shard.
    pub fn raw_size(&self) -> u64 {
        self.size
    }
 }
@@ -307,36 +587,19 @@ impl KeySpaceRandomAccum {
    }
 }
 /// Number of keys in `key_range`, saturating at `u32::MAX`.
 ///
 /// `u32::MAX` is also returned whenever the two endpoints differ in any of
 /// `field1`..`field4`; only `field5`/`field6` take part in the subtraction.
 #[inline(always)]
 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
    let (lo, hi) = (key_range.start, key_range.end);
    let same_prefix = hi.field1 == lo.field1
        && hi.field2 == lo.field2
        && hi.field3 == lo.field3
        && hi.field4 == lo.field4;
    if !same_prefix {
        return u32::MAX;
    }
    // Pack field5 (high bits) and field6 (low 32 bits) into one 64-bit offset.
    let lo_offset = ((lo.field5 as u64) << 32) | (lo.field6 as u64);
    let hi_offset = ((hi.field5 as u64) << 32) | (hi.field6 as u64);
    let span = hi_offset - lo_offset;
    span.min(u32::MAX as u64) as u32
 }
 pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
 }
 #[cfg(test)]
 mod tests {
    use rand::{RngCore, SeedableRng};
    use crate::{
        models::ShardParameters,
        shard::{ShardCount, ShardNumber},
    };
    use super::*;
    use std::fmt::Write;
@@ -379,14 +642,17 @@ mod tests {
            accum.add_range(range.clone());
        }
-        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        let expected_size: u64 = ranges
-        assert_eq!(accum.size(), expected_size);
+            .iter()
            .map(|r| ShardedRange::raw_size(r) as u64)
            .sum();
        assert_eq!(accum.raw_size(), expected_size);
        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
-        assert_eq!(accum.size(), 0);
+        assert_eq!(accum.raw_size(), 0);
        assert_ks_eq(&accum.consume_keyspace(), vec![]);
-        assert_eq!(accum.size(), 0);
+        assert_eq!(accum.raw_size(), 0);
        for range in &ranges {
            accum.add_range(range.clone());
@@ -553,7 +819,16 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+        let removed = key_space1.remove_overlapping_with(&key_space2);
        let removed_expected = KeySpace {
            ranges: vec![
                Key::from_i128(2)..Key::from_i128(3),
                Key::from_i128(6)..Key::from_i128(7),
                Key::from_i128(11)..Key::from_i128(12),
            ],
        };
        assert_eq!(removed, removed_expected);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -583,7 +858,17 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
        let removed = key_space1.remove_overlapping_with(&key_space2);
        let removed_expected = KeySpace {
            ranges: vec![
                Key::from_i128(3)..Key::from_i128(5),
                Key::from_i128(8)..Key::from_i128(10),
                Key::from_i128(14)..Key::from_i128(15),
            ],
        };
        assert_eq!(removed, removed_expected);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -610,7 +895,11 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
        let removed = key_space1.remove_overlapping_with(&key_space2);
        let removed_expected = KeySpace::default();
        assert_eq!(removed, removed_expected);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -637,7 +926,17 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
        let removed = key_space1.remove_overlapping_with(&key_space2);
        let removed_expected = KeySpace {
            ranges: vec![
                Key::from_i128(9)..Key::from_i128(10),
                Key::from_i128(12)..Key::from_i128(15),
                Key::from_i128(17)..Key::from_i128(19),
            ],
        };
        assert_eq!(removed, removed_expected);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -650,4 +949,412 @@ mod tests {
            ]
        );
    }
    #[test]
    fn sharded_range_relation_gap() {
        let identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        let first_key = Key::from_hex("000000067F00000005000040100300000000").unwrap();
        let last_key = Key::from_hex("000000067F00000005000040130000004000").unwrap();
        let sharded = ShardedRange::new(first_key..last_key, &identity);
        // The endpoints lie in different relations, so the page count is reported as MAX
        assert_eq!(sharded.page_count(), u32::MAX);
    }
    #[test]
    fn shard_identity_keyspaces_single_key() {
        let identity = ShardIdentity::new(
            ShardNumber(1),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        let logical_size_key = Key::from_hex("000000067f000000010000007000ffffffff").unwrap();
        let following_key = Key::from_hex("000000067f00000001000000700100000000").unwrap();
        let sharded = ShardedRange::new(logical_size_key..following_key, &identity);
        // A range holding just the logical size key counts as exactly one page
        assert_eq!(sharded.page_count(), 1);
    }
    /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
    #[test]
    fn contiguous_range_check() {
        // A range running from the end of one forkno into the first blocks of the
        // next one is not contiguous.
        assert!(!is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
                ..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
        ),);
        // The range goes all the way up to 0xffffffff, including it: this is
        // not considered a rel block range because 0xffffffff stores logical sizes,
        // not blocks.
        assert!(!is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
                ..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
        ),);
        // Keys within the normal data region of a relation
        assert!(is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
                ..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
        ),);
        // The logical size key of one forkno, then some blocks in the next
        assert!(is_contiguous_range(
            &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
                ..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
        ),);
    }
    #[test]
    fn shard_identity_keyspaces_forkno_gap() {
        let shard_identity = ShardIdentity::new(
            ShardNumber(1),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        let range = ShardedRange::new(
            Range {
                start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
                end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
            },
            &shard_identity,
        );
        // Range spanning the end of one forkno and the start of the next: we do not attempt to
        // calculate a valid size, because we have no way to know if the keys between start
        // and end are actually in use.
        assert_eq!(range.page_count(), u32::MAX);
    }
    #[test]
    fn shard_identity_keyspaces_one_relation() {
        // Very simple case: a range covering block zero of one relation, where that block
        // maps to shard zero. Every other shard should perceive the range's size as zero.
        for shard_number in 0..4 {
            let identity = ShardIdentity::new(
                ShardNumber(shard_number),
                ShardCount::new(4),
                ShardParameters::DEFAULT_STRIPE_SIZE,
            )
            .unwrap();
            let block_zero = Key::from_hex("000000067f00000001000000ae0000000000").unwrap()
                ..Key::from_hex("000000067f00000001000000ae0000000001").unwrap();
            let sharded = ShardedRange::new(block_zero, &identity);
            let expected_pages = if shard_number == 0 { 1 } else { 0 };
            assert_eq!(sharded.page_count(), expected_pages);
        }
    }
    /// Test helper: construct a ShardedRange and call fragment() on it, returning
    /// the total page count in the range and the fragments.
    ///
    /// Each returned fragment is a `(size, key range)` pair. Before returning, this helper
    /// asserts a set of invariants on the fragments: there is at least one, together they
    /// start and end at the input bounds, they are ordered and non-overlapping, and every
    /// fragment size is either `u32::MAX` or no larger than `target_nblocks`.
    fn do_fragment(
        range_start: Key,
        range_end: Key,
        shard_identity: &ShardIdentity,
        target_nblocks: u32,
    ) -> (u32, Vec<(u32, Range<Key>)>) {
        let range = ShardedRange::new(
            Range {
                start: range_start,
                end: range_end,
            },
            shard_identity,
        );
        // Read the page count first: `fragment()` is called on `range` by value below.
        let page_count = range.page_count();
        let fragments = range.fragment(target_nblocks);
        // Invariant: we always get at least one fragment
        assert!(!fragments.is_empty());
        // Invariant: the first/last fragment start/end should equal the input start/end
        assert_eq!(fragments.first().unwrap().1.start, range_start);
        assert_eq!(fragments.last().unwrap().1.end, range_end);
        if page_count > 0 {
            // Invariant: every fragment must contain at least one shard-local page, if the
            // total range contains at least one shard-local page
            let all_nonzero = fragments.iter().all(|f| f.0 > 0);
            if !all_nonzero {
                eprintln!("Found a zero-length fragment: {:?}", fragments);
            }
            assert!(all_nonzero);
        } else {
            // A range with no shard-local pages should always be returned as a single fragment
            assert_eq!(fragments, vec![(0, range_start..range_end)]);
        }
        // Invariant: fragments must be ordered and non-overlapping
        let mut last: Option<Range<Key>> = None;
        for frag in &fragments {
            if let Some(last) = last {
                assert!(frag.1.start >= last.end);
                assert!(frag.1.start > last.start);
            }
            last = Some(frag.1.clone())
        }
        // Invariant: fragments respect target_nblocks
        for frag in &fragments {
            // u32::MAX is the size reported when fragmentation gives up (see the
            // cross-relation test), so it is exempt from the bound.
            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
        }
        (page_count, fragments)
    }
    /// Basic checks of fragment() on a range that contains exactly one stripe, and that
    /// stripe belongs to the shard under test.
    #[test]
    fn sharded_range_fragment_simple() {
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        // A range which we happen to know covers exactly one stripe which belongs to this shard
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
        // Expected result whenever the whole stripe comes back as a single fragment
        let whole_stripe: (u32, Vec<(u32, Range<Key>)>) =
            (32768, vec![(32768, input_start..input_end)]);
        // Asking for exactly stripe_size blocks yields the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 32768),
            whole_stripe
        );
        // Asking for more than that still yields the whole stripe
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 10000000),
            whole_stripe
        );
        // A target_nblocks of half the stripe size splits the stripe into two halves
        let midpoint = input_start.add(16384);
        let two_halves: (u32, Vec<(u32, Range<Key>)>) = (
            32768,
            vec![(16384, input_start..midpoint), (16384, midpoint..input_end)],
        );
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16384),
            two_halves
        );
    }
    #[test]
    fn sharded_range_fragment_multi_stripe() {
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
        // Ask for all the blocks, get a fragment that covers the whole range but reports
        // its size to be just the blocks belonging to our shard.
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 131072),
            (32768, vec![(32768, input_start..input_end)])
        );
        // Ask for a sub-stripe quantity: the last fragment extends to the end of the input
        // range but only counts the 768 blocks left over in our stripe.
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16000),
            (
                32768,
                vec![
                    (16000, input_start..input_start.add(16000)),
                    (16000, input_start.add(16000)..input_start.add(32000)),
                    (768, input_start.add(32000)..input_end),
                ]
            )
        );
        // Try on a range that starts one block after the beginning of our owned stripe:
        // one fewer shard-local block is counted.
        assert_eq!(
            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
            (32767, vec![(32767, input_start.add(1)..input_end)])
        );
    }
    /// Test our calculations work correctly when we start a range from the logical size key of
    /// a previous relation.
    #[test]
    fn sharded_range_fragment_starting_from_logical_size() {
        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
        let shard_identity = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        // Expected size 0x8001: the logical size key plus the 0x8000 blocks that follow it
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x10000),
            (0x8001, vec![(0x8001, input_start..input_end)])
        );
        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
        // store all logical sizes)
        let shard_identity = ShardIdentity::new(
            ShardNumber(1),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        // Expected size 1: only the logical size key is local to this shard
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x10000),
            (0x1, vec![(0x1, input_start..input_end)])
        );
    }
    /// Test that ShardedRange behaves properly when used on un-sharded data
    #[test]
    fn sharded_range_fragment_unsharded() {
        let shard_identity = ShardIdentity::unsharded();
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
        // Every block counts when unsharded, so a 0x10000-block range with a target of
        // 0x8000 is expected to split into two equal fragments.
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 0x8000),
            (
                0x10000,
                vec![
                    (0x8000, input_start..input_start.add(0x8000)),
                    (0x8000, input_start.add(0x8000)..input_start.add(0x10000))
                ]
            )
        );
    }
    #[test]
    fn sharded_range_fragment_cross_relation() {
        // A range that spans relations: fragmentation is expected to give up and hand back
        // the input range as a single fragment of size u32::MAX
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
        let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
        let gave_up: (u32, Vec<(u32, Range<Key>)>) =
            (u32::MAX, vec![(u32::MAX, input_start..input_end)]);
        // First with an unsharded identity
        let unsharded = ShardIdentity::unsharded();
        assert_eq!(
            do_fragment(input_start, input_end, &unsharded, 0x8000),
            gave_up
        );
        // Same, but using a sharded identity
        let sharded = ShardIdentity::new(
            ShardNumber(0),
            ShardCount::new(4),
            ShardParameters::DEFAULT_STRIPE_SIZE,
        )
        .unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &sharded, 0x8000),
            gave_up
        );
    }
    #[test]
    fn sharded_range_fragment_tiny_nblocks() {
        let shard_identity = ShardIdentity::unsharded();
        // A small range (0x38 blocks) inside a single relation, fragmented with a
        // target_nblocks smaller than the range: expect three full 16-block fragments
        // followed by one fragment holding the remaining 8 blocks.
        let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
        let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
        assert_eq!(
            do_fragment(input_start, input_end, &shard_identity, 16),
            (
                0x38,
                vec![
                    (16, input_start..input_start.add(16)),
                    (16, input_start.add(16)..input_start.add(32)),
                    (16, input_start.add(32)..input_start.add(48)),
                    (8, input_start.add(48)..input_end),
                ]
            )
        );
    }
    /// Randomized check of fragment(): 1000 iterations over random shard identities,
    /// targets and ranges, relying mostly on the invariants asserted inside do_fragment.
    #[test]
    fn sharded_range_fragment_fuzz() {
        // Use a fixed seed: we don't want to explicitly pick values, but we do want
        // the test to be reproducible.
        let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
        for _i in 0..1000 {
            // Half of the iterations run unsharded, the rest use 1..=127 shards
            let shard_identity = if prng.next_u32() % 2 == 0 {
                ShardIdentity::unsharded()
            } else {
                let shard_count = prng.next_u32() % 127 + 1;
                ShardIdentity::new(
                    ShardNumber((prng.next_u32() % shard_count) as u8),
                    ShardCount::new(shard_count as u8),
                    ShardParameters::DEFAULT_STRIPE_SIZE,
                )
                .unwrap()
            };
            let target_nblocks = prng.next_u32() % 65536 + 1;
            let start_offset = prng.next_u32() % 16384;
            // Try ranges of between 1 and 8192 blocks, so they are never empty
            let range_size = prng.next_u32() % 8192 + 1;
            // A range of data blocks that starts at a random block offset from a fixed base key
            let input_start = Key::from_hex("000000067F00000001000004E10000000000")
                .unwrap()
                .add(start_offset);
            let input_end = input_start.add(range_size);
            // This test's main success conditions are the invariants baked into do_fragment
            let (_total_size, fragments) =
                do_fragment(input_start, input_end, &shard_identity, target_nblocks);
            // Pick a random key within the range and check it appears in the output
            let example_key = input_start.add(prng.next_u32() % range_size);
            // Panic on unwrap if it isn't found
            let example_key_frag = fragments
                .iter()
                .find(|f| f.1.contains(&example_key))
                .unwrap();
            // Check that the fragment containing our random key has a nonzero size if
            // that key is shard-local
            let example_key_local = !shard_identity.is_key_disposable(&example_key);
            if example_key_local {
                assert!(example_key_frag.0 > 0);
            }
        }
    }
 }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,6 +1,5 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;
 pub mod controller_api;
 pub mod key;
@@ -11,7 +10,4 @@ pub mod shard;
 /// Public API types
 pub mod upcall_api;
-pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
+pub mod config;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,3 +1,4 @@
 pub mod detach_ancestor;
 pub mod partitioning;
 pub mod utilization;
@@ -8,6 +9,7 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };
@@ -159,6 +161,22 @@ impl std::fmt::Debug for TenantState {
    }
 }
 /// A temporary lease to a specific lsn inside a timeline.
 /// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
 #[serde_as]
 #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
 pub struct LsnLease {
    /// Expiration time of the lease; (de)serialized through `SystemTimeAsRfc3339Millis`.
    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
    pub valid_until: SystemTime,
 }
 // Serde adapter for `SystemTime`: serializes to a string produced by
 // `humantime::format_rfc3339_millis` and deserializes a string with
 // `humantime::parse_rfc3339`.
 serde_with::serde_conv!(
    SystemTimeAsRfc3339Millis,
    SystemTime,
    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
 );
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
@@ -287,7 +305,7 @@ pub struct TenantConfig {
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -303,6 +321,103 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
 }
 /// The policy for the aux file storage. It can be switched through the `switch_aux_file_policy`
 /// tenant config. When the first aux file is written, the policy will be persisted in the
 /// `index_part.json` file, and from then on it has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
 ///
 /// Unset -> V1
 ///       -> V2
 ///       -> CrossValidation -> V2
 ///
 /// String form (strum `Display`/`EnumString`, also used for serde) is kebab-case, and
 /// parsing is ASCII case-insensitive.
 #[derive(
    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
 )]
 #[strum(serialize_all = "kebab-case")]
 pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
    #[strum(ascii_case_insensitive)]
    CrossValidation,
 }
 impl AuxFilePolicy {
    /// Whether switching from `from` (`None` meaning "unset") to `to` is allowed.
    ///
    /// An unset policy may move to any policy; apart from that only
    /// `CrossValidation -> V2` is accepted.
    pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
        match (from, to) {
            (None, _) => true,
            (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) => true,
            _ => false,
        }
    }
    /// The policy used when a tenant writes aux files without having set
    /// `switch_aux_file_policy`.
    pub fn default_tenant_config() -> Self {
        AuxFilePolicy::V1
    }
 }
 /// In-memory atomic flag holding an `Option<AuxFilePolicy>`, encoded as a `usize`
 /// where 0 means unspecified.
 pub struct AtomicAuxFilePolicy(AtomicUsize);
 impl AtomicAuxFilePolicy {
    /// Create the flag holding `policy` (`None` is stored as 0).
    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
        let encoded = policy.map_or(0, AuxFilePolicy::to_usize);
        Self(AtomicUsize::new(encoded))
    }
    /// Read the current policy using `Acquire` ordering.
    ///
    /// A non-zero value is decoded with `AuxFilePolicy::from_usize`, which panics on an
    /// unknown encoding.
    pub fn load(&self) -> Option<AuxFilePolicy> {
        let encoded = self.0.load(std::sync::atomic::Ordering::Acquire);
        if encoded == 0 {
            None
        } else {
            Some(AuxFilePolicy::from_usize(encoded))
        }
    }
    /// Replace the stored policy using `Release` ordering (`None` is stored as 0).
    pub fn store(&self, policy: Option<AuxFilePolicy>) {
        let encoded = policy.map_or(0, AuxFilePolicy::to_usize);
        self.0.store(encoded, std::sync::atomic::Ordering::Release);
    }
 }
 impl AuxFilePolicy {
    pub fn to_usize(self) -> usize {
        match self {
            Self::V1 => 1,
            Self::CrossValidation => 2,
            Self::V2 => 3,
        }
    }
    pub fn try_from_usize(this: usize) -> Option<Self> {
        match this {
            1 => Some(Self::V1),
            2 => Some(Self::CrossValidation),
            3 => Some(Self::V2),
            _ => None,
        }
    }
    pub fn from_usize(this: usize) -> Self {
        Self::try_from_usize(this).unwrap()
    }
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -323,13 +438,28 @@ impl EvictionPolicy {
    }
 }
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
-#[serde(tag = "kind")]
+    Eq,
    PartialEq,
    Debug,
    Copy,
    Clone,
    strum_macros::EnumString,
    strum_macros::Display,
    serde_with::DeserializeFromStr,
    serde_with::SerializeDisplay,
 )]
 #[strum(serialize_all = "kebab-case")]
 pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
 }
 /// Settings wrapper around a `CompactionAlgorithm`; the chosen algorithm is carried in
 /// the `kind` field.
 #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
 pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -429,7 +559,6 @@ pub struct StatusResponse {
 /// Request carrying a `LocationConfig`, whose fields are flattened into the top level of
 /// the serialized form, together with an optional `tenant_id`.
 // NOTE(review): the serde documentation states that `deny_unknown_fields` is not supported
 // in combination with `flatten` — confirm that unknown fields are really rejected here.
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -578,6 +707,9 @@ pub struct TimelineInfo {
    pub state: TimelineState,
    pub walreceiver_status: String,
    /// The last aux file policy being used on this timeline
    pub last_aux_file_policy: Option<AuxFilePolicy>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -684,6 +816,8 @@ pub enum HistoricLayerInfo {
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
        l0: bool,
    },
    Image {
        layer_file_name: String,
@@ -719,6 +853,16 @@ impl HistoricLayerInfo {
        };
        *field = value;
    }
    pub fn layer_file_size(&self) -> u64 {
        match self {
            HistoricLayerInfo::Delta {
                layer_file_size, ..
            } => *layer_file_size,
            HistoricLayerInfo::Image {
                layer_file_size, ..
            } => *layer_file_size,
        }
    }
 }
 #[derive(Debug, Serialize, Deserialize)]
@@ -726,6 +870,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
 }
 /// Request body for ingesting aux files.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct IngestAuxFilesRequest {
    /// String-to-string map of aux files.
    // NOTE(review): presumably file path -> file content; confirm against the handler.
    pub aux_files: HashMap<String, String>,
 }
 /// Request body for listing aux files.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct ListAuxFilesRequest {
    /// LSN supplied with the request.
    // NOTE(review): presumably the LSN at which the listing is evaluated; confirm against the handler.
    pub lsn: Lsn,
 }
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct DownloadRemoteLayersTaskInfo {
    pub task_id: String,
@@ -750,9 +904,6 @@ pub struct TimelineGcRequest {
 /// Status of a WAL redo manager process: its pid and the kind of process.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerProcessStatus {
    /// Process id of the process.
    pub pid: u32,
    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
    /// `ProcessKind` is a transitory thing, so it has no enum representation in `pageserver_api`.
    pub kind: Cow<'static, str>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -780,6 +931,66 @@ pub struct SecondaryProgress {
    pub bytes_total: u64,
 }
 /// One shard entry of a [`TenantScanRemoteStorageResponse`].
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantScanRemoteStorageShard {
    /// Identity of the tenant shard this entry describes.
    pub tenant_shard_id: TenantShardId,
    /// Generation number for the shard, if one is available.
    pub generation: Option<u32>,
 }
 /// Response listing tenant shards; the `Default` value has an empty list.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantScanRemoteStorageResponse {
    /// One entry per reported shard.
    pub shards: Vec<TenantScanRemoteStorageShard>,
 }
 /// Metric by which tenant shards are ordered in a [`TopTenantShardsRequest`].
 ///
 /// Serialized in `snake_case` (`resident_size`, `max_logical_size`).
 /// The default ordering is [`TenantSorting::ResidentSize`].
 #[derive(Serialize, Deserialize, Debug, Clone, Default)]
 #[serde(rename_all = "snake_case")]
 pub enum TenantSorting {
    /// Order by resident size. This is the default.
    #[default]
    ResidentSize,
    /// Order by maximum logical size.
    MaxLogicalSize,
 }
 /// Parameters for a query returning the top tenant shards: the ordering metric,
 /// the maximum number of results, and two optional filters.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct TopTenantShardsRequest {
    // How would you like to sort the tenants?
    pub order_by: TenantSorting,
    // How many results?
    pub limit: usize,
    // Omit tenants with more than this many shards (e.g. if this is the max number of shards
    // that the caller would ever split to)
    // NOTE(review): the `_lt` suffix suggests only tenants with strictly fewer shards than
    // this value are kept; confirm whether the bound is inclusive.
    pub where_shards_lt: Option<ShardCount>,
    // Omit tenants where the ordering metric is less than this (this is an optimization to
    // let us quickly exclude numerous tiny shards)
    pub where_gt: Option<u64>,
 }
 /// One entry of a [`TopTenantShardsResponse`]: a tenant shard and its size metrics.
 #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub struct TopTenantShardItem {
    /// Identity of the tenant shard.
    pub id: TenantShardId,
    /// Total size of layers on local disk for all timelines in this tenant
    pub resident_size: u64,
    /// Total size of layers in remote storage for all timelines in this tenant
    pub physical_size: u64,
    /// The largest logical size of a timeline within this tenant
    pub max_logical_size: u64,
 }
 /// Response to a [`TopTenantShardsRequest`]; the `Default` value has an empty list.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TopTenantShardsResponse {
    /// The selected tenant shards.
    pub shards: Vec<TopTenantShardItem>,
 }
 pub mod virtual_file {
    #[derive(
        Copy,
@@ -847,39 +1058,72 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
    }
 }
 // In the V2 protocol version, a GetPage request contains two LSN values:
 //
 // request_lsn: Get the page version at this point in time.  Lsn::MAX is a special value that means
 // "get the latest version present". It's used by the primary server, which knows that no one else
 // is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
 // Lsn::MAX. Standby servers use the current replay LSN as the request LSN.
 //
 // not_modified_since: Hint to the pageserver that the client knows that the page has not been
 // modified between 'not_modified_since' and the request LSN. It's always correct to set
 // 'not_modified_since' equal to 'request_lsn' (unless Lsn::MAX is used as the 'request_lsn'), but
 // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
 // request without waiting for 'request_lsn' to arrive.
 //
 // The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
 // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
 // 'latest' was set to true. The V2 interface was added because there was no correct way for a
 // standby to request a page at a particular non-latest LSN, and also include the
 // 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
 // request, if the standby knows that the page hasn't been modified since, and risk getting an error
 // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
 // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
 // interface allows sending both LSNs, and lets the pageserver do the right thing. There is no
 // difference in the responses between V1 and V2.
 //
 // The Request structs below reflect the V2 interface. If V1 is used, the parse function
 // maps the old format requests to the new format.
 //
 /// Wire-format version of compute -> pageserver pagestream requests; it tells
 /// `PagestreamFeMessage::parse` how to read the LSN part of each message.
 #[derive(Clone, Copy)]
 pub enum PagestreamProtocolVersion {
    /// Legacy format: a boolean `latest` flag followed by a single LSN.
    V1,
    /// Current format: two LSNs, `request_lsn` followed by `not_modified_since`.
    V2,
 }
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
-    pub latest: bool,
+    pub request_lsn: Lsn,
-    pub lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub rel: RelTag,
 }
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
-    pub latest: bool,
+    pub request_lsn: Lsn,
-    pub lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub rel: RelTag,
 }
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
-    pub latest: bool,
+    pub request_lsn: Lsn,
-    pub lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub rel: RelTag,
    pub blkno: u32,
 }
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
-    pub latest: bool,
+    pub request_lsn: Lsn,
-    pub lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub dbnode: u32,
 }
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetSlruSegmentRequest {
-    pub latest: bool,
+    pub request_lsn: Lsn,
-    pub lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub kind: u8,
    pub segno: u32,
 }
@@ -926,14 +1170,16 @@ pub struct TenantHistorySize {
 }
 impl PagestreamFeMessage {
    /// Serialize a compute -> pageserver message. This is currently only used in testing
    /// tools. Always uses protocol version 2.
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();
        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -942,8 +1188,8 @@ impl PagestreamFeMessage {
            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -952,8 +1198,8 @@ impl PagestreamFeMessage {
            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -963,15 +1209,15 @@ impl PagestreamFeMessage {
            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.dbnode);
            }
            Self::GetSlruSegment(req) => {
                bytes.put_u8(4);
-                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u8(req.kind);
                bytes.put_u32(req.segno);
            }
@@ -980,18 +1226,40 @@ impl PagestreamFeMessage {
        bytes.into()
    }
-    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(
-        // TODO these gets can fail
+        body: &mut R,
-
+        protocol_version: PagestreamProtocolVersion,
    ) -> anyhow::Result<PagestreamFeMessage> {
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;
        let (request_lsn, not_modified_since) = match protocol_version {
            PagestreamProtocolVersion::V2 => (
                Lsn::from(body.read_u64::<BigEndian>()?),
                Lsn::from(body.read_u64::<BigEndian>()?),
            ),
            PagestreamProtocolVersion::V1 => {
                // In the old protocol, each message starts with a boolean 'latest' flag,
                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
                // 'not_modified_since', used in the new protocol version.
                let latest = body.read_u8()? != 0;
                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
                if latest {
                    (Lsn::MAX, request_lsn) // get latest version
                } else {
                    (request_lsn, request_lsn) // get version at specified LSN
                }
            }
        };
        // The rest of the messages are the same between V1 and V2
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: body.read_u8()? != 0,
+                request_lsn,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1000,8 +1268,8 @@ impl PagestreamFeMessage {
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: body.read_u8()? != 0,
+                request_lsn,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1010,8 +1278,8 @@ impl PagestreamFeMessage {
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: body.read_u8()? != 0,
+                request_lsn,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1021,14 +1289,14 @@ impl PagestreamFeMessage {
                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: body.read_u8()? != 0,
+                request_lsn,
-                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                not_modified_since,
                dbnode: body.read_u32::<BigEndian>()?,
            })),
            4 => Ok(PagestreamFeMessage::GetSlruSegment(
                PagestreamGetSlruSegmentRequest {
-                    latest: body.read_u8()? != 0,
+                    request_lsn,
-                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+                    not_modified_since,
                    kind: body.read_u8()?,
                    segno: body.read_u32::<BigEndian>()?,
                },
@@ -1148,6 +1416,7 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
    use serde_json::json;
    use std::str::FromStr;
    use super::*;
@@ -1156,8 +1425,8 @@ mod tests {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                latest: true,
+                request_lsn: Lsn(4),
-                lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1166,8 +1435,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                latest: false,
+                request_lsn: Lsn(4),
-                lsn: Lsn(4),
+                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1176,8 +1445,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                latest: true,
+                request_lsn: Lsn(4),
-                lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1187,14 +1456,16 @@ mod tests {
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                latest: true,
+                request_lsn: Lsn(4),
-                lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
+            let reconstructed =
                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
                    .unwrap();
            assert!(msg == reconstructed);
        }
    }
@@ -1353,4 +1624,69 @@ mod tests {
            assert_eq!(actual, expected, "example on {line}");
        }
    }
    /// Checks `AuxFilePolicy::is_valid_migration_path` against a table of
    /// `(current policy, target policy, expected verdict)` cases.
    #[test]
    fn test_aux_file_migration_path() {
        use AuxFilePolicy::{CrossValidation, V1, V2};

        let cases = [
            // With no policy set yet, every target is accepted.
            (None, V1, true),
            (None, V2, true),
            (None, CrossValidation, true),
            // Self-migration is not a valid migration path, and the caller should handle it by itself.
            (Some(V1), V1, false),
            (Some(V2), V2, false),
            (Some(CrossValidation), CrossValidation, false),
            // Migrations not allowed
            (Some(CrossValidation), V1, false),
            (Some(V1), V2, false),
            (Some(V2), V1, false),
            (Some(V2), CrossValidation, false),
            (Some(V1), CrossValidation, false),
            // Migrations allowed
            (Some(CrossValidation), V2, true),
        ];

        for (from, to, expected) in cases {
            // Render the case up front so a failure names the offending pair.
            let label = format!("{from:?} -> {to:?}");
            let actual = AuxFilePolicy::is_valid_migration_path(from, to);
            assert_eq!(actual, expected, "migration {label}");
        }
    }
    /// Checks that `AuxFilePolicy::from_str` accepts both upper- and lower-case
    /// version names as well as the kebab-case `cross-validation` spelling.
    #[test]
    fn test_aux_parse() {
        let cases = [
            ("V2", AuxFilePolicy::V2),
            ("v2", AuxFilePolicy::V2),
            ("cross-validation", AuxFilePolicy::CrossValidation),
        ];
        for (text, expected) in cases {
            assert_eq!(AuxFilePolicy::from_str(text).unwrap(), expected);
        }
    }
 }
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -0,0 +1,6 @@
 use utils::id::TimelineId;
 /// Serializable result listing the timelines that were reparented.
 ///
 /// The `Default` value has an empty list.
 #[derive(Default, serde::Serialize)]
 pub struct AncestorDetached {
    /// Ids of the reparented timelines.
    pub reparented_timelines: Vec<TimelineId>,
 }
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,9 +1,11 @@
 use utils::lsn::Lsn;
 use crate::keyspace::SparseKeySpace;
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-
+    pub sparse_keys: crate::keyspace::SparseKeySpace,
    pub at_lsn: Lsn,
 }
@@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
        map.serialize_key("sparse_keys")?;
        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -133,6 +139,12 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
            "sparse_keys": [
              [
                "620000000000000000000000000000000000",
                "620000000000000000000000000000000003"
              ]
            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,6 +5,7 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;
@@ -96,7 +97,7 @@ impl ShardCount {
    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as `TenantShardId::unsharded`.
+    /// as [`TenantShardId::unsharded`].
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
@@ -115,14 +116,16 @@ impl ShardCount {
        self.0
    }
-    ///
+    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
    /// uses the legacy format for `TenantShardId`. See also the documentation for
    /// [`Self::count`].
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
    /// [`Self::literal`] would return.
-    pub fn new(val: u8) -> Self {
+    pub const fn new(val: u8) -> Self {
        Self(val)
    }
 }
@@ -450,7 +453,7 @@ impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants.  Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
-    pub fn unsharded() -> Self {
+    pub const fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
@@ -556,6 +559,14 @@ impl ShardIdentity {
        }
    }
    /// Packs this identity's shard number and shard count into a `ShardIndex`.
    pub fn shard_index(&self) -> ShardIndex {
        let shard_number = self.number;
        let shard_count = self.count;
        ShardIndex {
            shard_count,
            shard_number,
        }
    }
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -649,7 +660,13 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
    // because they must be included in basebackups.
    let is_initfork = key.field5 == INIT_FORKNUM;
    !is_rel_block_key(key) || is_initfork
 }
 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        Ok(ProcessMsgResult::Continue)
    }
-    /// Log as info/error result of handling COPY stream and send back
+    /// - Log as info/error result of handling COPY stream and send back
-    /// ErrorResponse if that makes sense. Shutdown the stream if we got
+    ///   ErrorResponse if that makes sense.
-    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
+    /// - Shutdown the stream if we got Terminate.
-    /// close.
+    /// - Then close the connection because we don't handle exiting from COPY
    ///   stream normally.
    pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
        use CopyStreamHandlerEnd::*;
@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            }
        }
        if let Terminate = &end {
            self.state = ProtoState::Closed;
        }
        let err_to_send_and_errcode = match &end {
            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
            Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                error!("failed to send ErrorResponse: {}", ee);
            }
        }
        // Proper COPY stream finishing to continue using the connection is not
        // implemented at the server side (we don't need it so far). To prevent
        // further usages of the connection, close it.
        self.framed.shutdown().await.ok();
        self.state = ProtoState::Closed;
    }
 }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
    }
 }
 /// Renders the connection target as `postgresql://<host>:<port>`.
 impl fmt::Display for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Only host and port are picked out; the password is deliberately
        // left out of the display string.
        let Self { host, port, .. } = self;
        write!(f, "postgresql://{}:{}", host, port)
    }
 }
 impl fmt::Debug for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
 pub use v14::bindings::{CheckPoint, ControlFileData};
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -331,7 +331,10 @@ impl CheckPoint {
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound.
-        let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
+        let mut new_xid = std::cmp::max(
            xid.wrapping_add(1),
            pg_constants::FIRST_NORMAL_TRANSACTION_ID,
        );
        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
        new_xid =
@@ -367,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);
    let first_page_only = seg_off < XLOG_BLCKSZ;
-    let (shdr_rem_len, infoflags) = if first_page_only {
+    // If first records starts in the middle of the page, pretend in page header
-        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
+    // there is a fake record which ends where first real record starts. This
    // makes pg_waldump etc happy.
    let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
        assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
        // xlp_rem_len doesn't include page header, hence the subtraction.
        (
            seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
            pg_constants::XLP_FIRST_IS_CONTRECORD,
        )
    } else {
        (0, 0)
    };
@@ -397,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
    if !first_page_only {
        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
        // see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
        let (xlp_rem_len, xlp_info) = if page_off > 0 {
            assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
            (
                (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
                pg_constants::XLP_FIRST_IS_CONTRECORD,
            )
        } else {
            (0, 0)
        };
        let header = XLogPageHeaderData {
            xlp_magic: XLOG_PAGE_MAGIC as u16,
-            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+            xlp_info,
                pg_constants::XLP_FIRST_IS_CONTRECORD
            } else {
                0
            },
            xlp_tli: PG_TLI,
            xlp_pageaddr: lsn.page_lsn().0,
-            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+            xlp_rem_len,
                page_off as u32
            } else {
                0u32
            },
            ..Default::default() // Put 0 in padding fields.
        };
        let hdr_bytes = header.encode()?;
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }
-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
    //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
-    // because pg_current_wal_insert_lsn skips page headers.
+    // returns the position just after the page header on the next page. That's where the next
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // record will be inserted. But the page header hasn't actually been written to the WAL
    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
    // error. Because of that, if the insert location is just after a page header, back off to
    // previous page boundary.
    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
    }
    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
    Ok(intermediate_lsns)
 }
@@ -320,61 +332,70 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        client.execute("CREATE table t(x int)", &[])?;
-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
-        // We will use logical message as the padding. We start with detecting how much WAL
+        // will use carefully-sized logical messages to advance WAL insert location such
-        // it takes for one logical message, considering all alignments and headers.
+        // that there is just enough space on the page for the XLOG_SWITCH record.
-        let base_wal_advance = {
+        loop {
            // We start with measuring how much WAL it takes for one logical message,
            // considering all alignments and headers.
            let before_lsn = client.pg_current_wal_insert_lsn()?;
            // Small non-empty message bigger than few bytes is more likely than an empty
            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
                + XLOG_SIZE_OF_XLOG_RECORD
        };
        let mut remaining_lsn =
            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
        if remaining_lsn < base_wal_advance {
            remaining_lsn += XLOG_BLCKSZ;
        }
        let repeats = 10 + remaining_lsn - base_wal_advance;
        info!(
            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
            client.pg_current_wal_insert_lsn()?,
            remaining_lsn,
            base_wal_advance,
            repeats
        );
        client.execute(
            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
            &[&(repeats as i32)],
        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
            XLOG_SIZE_OF_XLOG_RECORD
        );
-        // Emit the XLOG_SWITCH
+            // Did the record cross a page boundary? If it did, start over. Crossing a
-        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+            // page boundary adds to the apparent size of the record because of the page
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+            // header, which throws off the calculation.
-        let next_segment = PgLsn::from(0x0200_0000);
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
-        ensure!(
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
-            xlog_switch_record_end < next_segment,
+            {
-            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
+                continue;
-            xlog_switch_record_end,
+            }
-            next_segment
+            // base_size is the size of a logical message without the payload
-        );
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
-        ensure!(
+
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            // Is there enough space on the page for another logical message and an
-            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            // XLOG_SWITCH? If not, start over.
-            xlog_switch_record_end,
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
-        );
+                continue;
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+            }
            // We will write another logical message, such that after the logical message
            // record, there will be space for exactly one XLOG_SWITCH. How large should
            // the logical message's payload be? An XLOG_SWITCH record has no data => its
            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
                &[&(repeats as i32)],
            )?;
            info!(
                "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
                client.pg_current_wal_insert_lsn()?,
                XLOG_SIZE_OF_XLOG_RECORD
            );
            // Emit the XLOG_SWITCH
            let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
            let xlog_switch_record_end: PgLsn =
                client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
            if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
                != XLOG_SIZE_OF_XLOG_SHORT_PHD
            {
                warn!(
                    "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
                    xlog_switch_record_end,
                    u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
                );
                continue;
            }
            return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
        }
    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -38,6 +38,7 @@ azure_storage_blobs.workspace = true
 futures-util.workspace = true
 http-types.workspace = true
 itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }
 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -3,6 +3,7 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
 use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
@@ -20,6 +21,7 @@ use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
 use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
@@ -128,12 +130,12 @@ impl AzureBlobStorage {
        let kind = RequestKind::Get;
        let _permit = self.permit(kind, cancel).await?;
        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let mut etag = None;
        let mut last_modified = None;
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let download = async {
            let response = builder
@@ -152,39 +154,46 @@ impl AzureBlobStorage {
                Err(_elapsed) => Err(DownloadError::Timeout),
            });
-            let mut response = std::pin::pin!(response);
+            let mut response = Box::pin(response);
-            let mut bufs = Vec::new();
+            let Some(part) = response.next().await else {
            while let Some(part) = response.next().await {
                let part = part?;
                if etag.is_none() {
                    etag = Some(part.blob.properties.etag);
                }
                if last_modified.is_none() {
                    last_modified = Some(part.blob.properties.last_modified.into());
                }
                if let Some(blob_meta) = part.blob.metadata {
                    metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
                }
                let data = part
                    .data
                    .collect()
                    .await
                    .map_err(|e| DownloadError::Other(e.into()))?;
                bufs.push(data);
            }
            if bufs.is_empty() {
                return Err(DownloadError::Other(anyhow::anyhow!(
-                    "Azure GET response contained no buffers"
+                    "Azure GET response contained no response body"
                )));
            };
            let part = part?;
            if etag.is_none() {
                etag = Some(part.blob.properties.etag);
            }
            if last_modified.is_none() {
                last_modified = Some(part.blob.properties.last_modified.into());
            }
            if let Some(blob_meta) = part.blob.metadata {
                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
            }
            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
            let etag = etag.unwrap();
            let last_modified = last_modified.unwrap();
            let tail_stream = response
                .map(|part| match part {
                    Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
                    Err(e) => {
                        Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
                    }
                })
                .flatten();
            let stream = part
                .data
                .map(|r| r.map_err(io::Error::other))
                .chain(sync_wrapper::SyncStream::new(tail_stream));
            //.chain(SyncStream::from_pin(Box::pin(tail_stream)));
            let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
            Ok(Download {
-                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+                download_stream: Box::pin(download_stream),
                etag,
                last_modified,
                metadata: Some(StorageMetadata(metadata)),
@@ -193,7 +202,10 @@ impl AzureBlobStorage {
        tokio::select! {
            bufs = download => bufs,
-            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
+            cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
            },
        }
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -21,11 +21,13 @@ use std::{
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    pin::Pin,
    str::FromStr,
    sync::Arc,
    time::{Duration, SystemTime},
 };
 use anyhow::{bail, Context};
 use aws_sdk_s3::types::StorageClass;
 use camino::{Utf8Path, Utf8PathBuf};
 use bytes::Bytes;
@@ -53,11 +55,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// We set this a little bit low as we currently buffer the entire file into RAM
+/// Set this limit analogously to the S3 limit
 ///
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currently means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -134,6 +136,11 @@ impl RemotePath {
    /// Removes the leading prefix `p` from this path by delegating to
    /// `strip_prefix` on the wrapped inner paths.
    ///
    /// Returns the remaining relative path on success, or a
    /// `std::path::StripPrefixError` when the inner call fails
    /// (presumably when `p` is not a prefix of `self` — confirm against the
    /// inner path type's documentation).
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
    /// Returns a new `RemotePath` whose inner path is this path's text with a
    /// single `/` appended. `self` is left untouched.
    pub fn add_trailing_slash(&self) -> Self {
        // Unwrap safety: the input is built with `format!` from the existing
        // inner path, so it is guaranteed to be valid UTF-8.
        Self(format!("{}/", self.0).try_into().unwrap())
    }
 }
 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -157,47 +164,21 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
+    ///
-    /// so this method doesnt need to.
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
-    async fn list_prefixes(
+    /// from the absolute root of the bucket.
-        &self,
+    ///
-        prefix: Option<&RemotePath>,
+    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
-        cancel: &CancellationToken,
+    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
-        let result = self
+    /// returned in `keys`.
-            .list(prefix, ListingMode::WithDelimiter, None, cancel)
+    ///
-            .await?
+    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
-            .prefixes;
+    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
-        Ok(result)
+    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
    }
    /// Lists all files in a directory "recursively"
    /// (not really recursively, because AWS has a flat namespace).
    ///
    /// Note: this is subtly different from `list_prefixes`, because it lists
    /// files (keys) instead of names sharing common prefixes.
    /// For example,
    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
    ///
    /// Implemented as a call to `list` with `ListingMode::NoDelimiter`,
    /// returning only the `keys` of the resulting listing.
    ///
    /// `max_keys` limits the maximum number of keys returned; `None` means unlimited.
    /// `cancel` is forwarded to `list` unchanged.
    async fn list_files(
        &self,
        prefix: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        // Any error from `list` is propagated to the caller via `?`.
        let result = self
            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
            .await?
            .keys;
        Ok(result)
    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -336,41 +317,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }
    // Lists all the files in a "directory" by forwarding to the `list_files`
    // implementation of whichever storage backend this enum value wraps.
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
    //
    // `max_keys` limits the maximum number of keys returned; `None` means unlimited.
    // `cancel` is passed through to the backend unchanged.
    pub async fn list_files(
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        // Pure dispatch: every variant receives exactly the same arguments.
        match self {
            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
        }
    }
    // Lists the common *prefixes* found under `prefix`, if any, by forwarding to
    // the `list_prefixes` implementation of whichever storage backend this enum
    // value wraps.
    // Example (schematic, as given by the original author):
    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    //
    // `cancel` is passed through to the backend unchanged.
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        // Pure dispatch: every variant receives exactly the same arguments.
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
        }
    }
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -619,6 +565,7 @@ pub struct S3Config {
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
    pub concurrency_limit: NonZeroUsize,
    pub max_keys_per_list_response: Option<i32>,
    pub upload_storage_class: Option<StorageClass>,
 }
 impl Debug for S3Config {
@@ -747,6 +694,18 @@ impl RemoteStorageConfig {
                    endpoint,
                    concurrency_limit,
                    max_keys_per_list_response,
                    upload_storage_class: toml
                        .get("upload_storage_class")
                        .map(|prefix_in_bucket| -> anyhow::Result<_> {
                            let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
                            let storage_class = StorageClass::from_str(&s).expect("infallible");
                            #[allow(deprecated)]
                            if matches!(storage_class, StorageClass::Unknown(_)) {
                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
                            }
                            Ok(storage_class)
                        })
                        .transpose()?,
                })
            }
            (_, _, _, Some(_), None) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,11 +5,9 @@
 //! volume is mounted to the local FS.
 use std::{
-    borrow::Cow,
+    collections::HashSet,
    future::Future,
    io::ErrorKind,
    num::NonZeroU32,
    pin::Pin,
    time::{Duration, SystemTime, UNIX_EPOCH},
 };
@@ -22,11 +20,11 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use tracing::*;
+use utils::crashsafe::path_with_suffix_extension;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 use super::{RemoteStorage, StorageMetadata};
@@ -93,7 +91,47 @@ impl LocalFs {
    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        Ok(get_all_files(&self.storage_root, true)
+        use std::{future::Future, pin::Pin};
        fn get_all_files<'a, P>(
            directory_path: P,
        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
        where
            P: AsRef<Utf8Path> + Send + Sync + 'a,
        {
            Box::pin(async move {
                let directory_path = directory_path.as_ref();
                if directory_path.exists() {
                    if directory_path.is_dir() {
                        let mut paths = Vec::new();
                        let mut dir_contents = fs::read_dir(directory_path).await?;
                        while let Some(dir_entry) = dir_contents.next_entry().await? {
                            let file_type = dir_entry.file_type().await?;
                            let entry_path =
                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
                                    anyhow::Error::msg(format!(
                                        "non-Unicode path: {}",
                                        pb.to_string_lossy()
                                    ))
                                })?;
                            if file_type.is_symlink() {
                                tracing::debug!("{entry_path:?} is a symlink, skipping")
                            } else if file_type.is_dir() {
                                paths.extend(get_all_files(&entry_path).await?.into_iter())
                            } else {
                                paths.push(entry_path);
                            }
                        }
                        Ok(paths)
                    } else {
                        bail!("Path {directory_path:?} is not a directory")
                    }
                } else {
                    Ok(Vec::new())
                }
            })
        }
        Ok(get_all_files(&self.storage_root)
            .await?
            .into_iter()
            .map(|path| {
@@ -120,6 +158,14 @@ impl LocalFs {
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
        // If there's no trailing slash, we have to start looking from one above: even if
        // `initial_dir` is a directory, we should still list any prefixes in the parent
        // that start with the same string.
        if !full_path.to_string().ends_with('/') {
            initial_dir.pop();
        }
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
        let op = async {
            let mut result = Listing::default();
-            if let ListingMode::NoDelimiter = mode {
+            // Filter out directories: in S3 directories don't exist, only the keys within them do.
-                let keys = self
+            let keys = self
-                    .list_recursive(prefix)
+                .list_recursive(prefix)
                    .await
                    .map_err(DownloadError::Other)?;
                result.keys = keys
                    .into_iter()
                    .filter(|k| {
                        let path = k.with_base(&self.storage_root);
                        !path.is_dir()
                    })
                    .collect();
                if let Some(max_keys) = max_keys {
                    result.keys.truncate(max_keys.get() as usize);
                }
                return Ok(result);
            }
            let path = match prefix {
                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
                None => Cow::Borrowed(&self.storage_root),
            };
            let prefixes_to_filter = get_all_files(path.as_ref(), false)
                .await
                .map_err(DownloadError::Other)?;
            let keys = keys
                .into_iter()
                .filter(|k| {
                    let path = k.with_base(&self.storage_root);
                    !path.is_dir()
                })
                .collect();
-            // filter out empty directories to mirror s3 behavior.
+            if let ListingMode::NoDelimiter = mode {
-            for prefix in prefixes_to_filter {
+                result.keys = keys;
-                if prefix.is_dir()
+            } else {
-                    && is_directory_empty(&prefix)
+                let mut prefixes = HashSet::new();
-                        .await
+                for key in keys {
-                        .map_err(DownloadError::Other)?
+                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
-                {
+                    let relative_key = if let Some(prefix) = prefix {
-                    continue;
+                        let mut prefix = prefix.clone();
-                }
+                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
-
+                        // end up with full file/dir names.
-                let stripped = prefix
+                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
-                    .strip_prefix(&self.storage_root)
+                        let has_slash = prefix.0.to_string().ends_with('/');
-                    .context("Failed to strip prefix")
+                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
-                    .and_then(RemotePath::new)
+                            prefix
-                    .expect(
+                        } else {
-                        "We list files for storage root, hence should be able to remote the prefix",
+                            prefix.0.pop();
-                    );
+                            prefix
-
+                        };
-                if prefix.is_dir() {
+
-                    result.prefixes.push(stripped);
+                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
-                } else {
+                    } else {
-                    result.keys.push(stripped);
+                        key
                    };
                    let relative_key = format!("{}", relative_key);
                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                        let first_part = relative_key
                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
                            .next()
                            .unwrap()
                            .to_owned();
                        prefixes.insert(first_part);
                    } else {
                        result
                            .keys
                            .push(RemotePath::from_string(&relative_key).unwrap());
                    }
                }
                result.prefixes = prefixes
                    .into_iter()
                    .map(|s| RemotePath::from_string(&s).unwrap())
                    .collect();
            }
            if let Some(max_keys) = max_keys {
                result.keys.truncate(max_keys.get() as usize);
            }
            Ok(result)
        };
@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }
 /// Collects the paths of the entries found under `directory_path`.
 ///
 /// * If `directory_path` does not exist, an empty list is returned.
 /// * If it exists but is not a directory, an error is returned.
 /// * Symlinks are skipped (a debug message is logged for each).
 /// * Subdirectories are descended into when `recursive` is `true`; otherwise
 ///   the subdirectory's own path is added to the result.
 /// * Every other entry is added to the result as-is.
 ///
 /// An entry whose path cannot be converted to `Utf8PathBuf` aborts the
 /// listing with a "non-Unicode path" error.
 ///
 /// The future is boxed, which lets the function await itself for
 /// subdirectories.
 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
 ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
 where
    P: AsRef<Utf8Path> + Send + Sync + 'a,
 {
    Box::pin(async move {
        let directory_path = directory_path.as_ref();
        if directory_path.exists() {
            if directory_path.is_dir() {
                let mut paths = Vec::new();
                let mut dir_contents = fs::read_dir(directory_path).await?;
                while let Some(dir_entry) = dir_contents.next_entry().await? {
                    let file_type = dir_entry.file_type().await?;
                    // Reject entries whose path is not valid UTF-8 instead of
                    // silently dropping or lossily converting them.
                    let entry_path =
                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
                            anyhow::Error::msg(format!(
                                "non-Unicode path: {}",
                                pb.to_string_lossy()
                            ))
                        })?;
                    if file_type.is_symlink() {
                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            // Descend and merge the subdirectory's entries;
                            // the directory path itself is not recorded.
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
                        } else {
                            // Non-recursive mode: report the directory itself.
                            paths.push(entry_path)
                        }
                    } else {
                        paths.push(entry_path);
                    }
                }
                Ok(paths)
            } else {
                bail!("Path {directory_path:?} is not a directory")
            }
        } else {
            // A missing path is treated as an empty listing, not an error.
            Ok(Vec::new())
        }
    })
 }
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -923,13 +930,18 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
        let child_sibling =
            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
            listing.keys.into_iter().collect::<HashSet<_>>(),
            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
        );
        // Delimiter: should only go one deep
        let listing = storage
@@ -942,7 +954,25 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());
-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(
            listing.keys,
            [RemotePath::from_string("uncle").unwrap()].to_vec()
        );
        assert_eq!(
            listing.prefixes,
            [RemotePath::from_string("parent").unwrap()].to_vec()
        );
        // Delimiter and prefix without a trailing slash
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -951,12 +981,66 @@ mod fs_tests {
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
-                .to_vec()
+        );
        // Delimiter and prefix that's partway through a path component
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
            [RemotePath::from_string("grandparent").unwrap()].to_vec()
        );
        Ok(())
    }
    #[tokio::test]
    async fn list_part_component() -> anyhow::Result<()> {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
        // a freeform prefix.
        let _child_a =
            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
        let _child_b =
            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
        // Delimiter and prefix that's partway through a path component
        let listing = storage
            .list(
                Some(
                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
                ),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        let mut found_prefixes = listing.prefixes.clone();
        found_prefixes.sort();
        assert_eq!(
            found_prefixes,
            [
                RemotePath::from_string("tenant").unwrap(),
                RemotePath::from_string("tenant-01").unwrap(),
            ]
            .to_vec()
        );
        assert_eq!(listing.keys, [uncle.clone()].to_vec());
        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -27,10 +27,10 @@ use aws_config::{
 };
 use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
+    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,6 +62,7 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
    upload_storage_class: Option<StorageClass>,
    concurrency_limiter: ConcurrencyLimiter,
    // Per-request timeout. Accessible for tests.
    pub timeout: Duration,
@@ -74,13 +75,13 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
-            aws_config.bucket_name
+            remote_storage_config.bucket_name
        );
-        let region = Some(Region::new(aws_config.bucket_region.clone()));
+        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
@@ -112,6 +113,38 @@ impl S3Bucket {
        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
        let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
            BehaviorVersion::v2023_11_09(),
        )
        .region(region)
        .identity_cache(IdentityCache::lazy().build())
        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
        .sleep_impl(SharedAsyncSleep::from(sleep_impl));
        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
            s.spawn(|| {
                // TODO: make this function async.
                tokio::runtime::Builder::new_current_thread()
                    .enable_all()
                    .build()
                    .unwrap()
                    .block_on(sdk_config_loader.load())
            })
            .join()
            .unwrap()
        });
        let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
        // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
        // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
        if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
            s3_config_builder = s3_config_builder
                .endpoint_url(custom_endpoint)
                .force_path_style(true);
        }
        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -119,41 +152,36 @@ impl S3Bucket {
        retry_config
            .set_max_attempts(Some(1))
            .set_mode(Some(RetryMode::Adaptive));
        s3_config_builder = s3_config_builder.retry_config(retry_config.build());
-        let mut config_builder = Builder::default()
+        let s3_config = s3_config_builder.build();
-            .behavior_version(BehaviorVersion::v2023_11_09())
+        let client = aws_sdk_s3::Client::from_conf(s3_config);
            .region(region)
            .identity_cache(IdentityCache::lazy().build())
            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
            .retry_config(retry_config.build())
            .sleep_impl(SharedAsyncSleep::from(sleep_impl));
-        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
+        let prefix_in_bucket = remote_storage_config
-            config_builder = config_builder
+            .prefix_in_bucket
-                .endpoint_url(custom_endpoint)
+            .as_deref()
-                .force_path_style(true);
+            .map(|prefix| {
-        }
+                let mut prefix = prefix;
                while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    prefix = &prefix[1..]
                }
-        let client = Client::from_conf(config_builder.build());
+                let mut prefix = prefix.to_string();
                while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    prefix.pop();
                }
                prefix
            });
        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
            let mut prefix = prefix;
            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix = &prefix[1..]
            }
            let mut prefix = prefix.to_string();
            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix.pop();
            }
            prefix
        });
        Ok(Self {
            client,
-            bucket_name: aws_config.bucket_name.clone(),
+            bucket_name: remote_storage_config.bucket_name.clone(),
-            max_keys_per_list_response: aws_config.max_keys_per_list_response,
+            max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
+            concurrency_limiter: ConcurrencyLimiter::new(
                remote_storage_config.concurrency_limit.get(),
            ),
            upload_storage_class: remote_storage_config.upload_storage_class.clone(),
            timeout,
        })
    }
@@ -178,10 +206,7 @@ impl S3Bucket {
    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
+        let path_string = path.get_path().as_str();
            .get_path()
            .as_str()
            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -471,16 +496,11 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone())
+            .or_else(|| {
-            .map(|mut p| {
+                self.prefix_in_bucket.clone().map(|mut s| {
-                // required to end with a separator
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                // otherwise request will return only the entry of a prefix
+                    s
-                if matches!(mode, ListingMode::WithDelimiter)
+                })
                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });
        let _permit = self.permit(kind, cancel).await?;
@@ -549,11 +569,15 @@ impl RemoteStorage for S3Bucket {
                }
            }
-            result.prefixes.extend(
+            // S3 gives us prefixes like "foo/", we return them like "foo"
-                prefixes
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                    .iter()
+                Some(
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
+                    self.s3_object_to_relative_path(
-            );
+                        o.prefix()?
                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
                    ),
                )
            }));
            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -586,6 +610,7 @@ impl RemoteStorage for S3Bucket {
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_metadata(metadata.map(|m| m.0))
            .set_storage_class(self.upload_storage_class.clone())
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send();
@@ -637,6 +662,7 @@ impl RemoteStorage for S3Bucket {
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_storage_class(self.upload_storage_class.clone())
            .copy_source(copy_source)
            .send();
@@ -894,6 +920,7 @@ impl RemoteStorage for S3Bucket {
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();
@@ -1050,22 +1077,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path"],
+            vec!["", "some/path", "some/path/"],
-            vec!["/", "/some/path", "/some/path"],
+            vec!["/", "/some/path", "/some/path/"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
        ];
@@ -1077,6 +1104,7 @@ mod tests {
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
                upload_storage_class: None,
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner.list_prefixes(prefix, cancel).await
    }
    async fn list_files(
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner.list_files(folder, max_keys, cancel).await
    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list_prefixes(None, &cancel)
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
-        .await
+        .await?
-        .context("client list root prefixes failure")?
+        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );
    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix), &cancel)
+        .list(
-        .await
+            Some(&base_prefix.add_trailing_slash()),
-        .context("client list nested prefixes failure")?
+            ListingMode::WithDelimiter,
            None,
            &cancel,
        )
        .await?
        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+async fn list_no_delimiter_works(
    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
 ) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None, &cancel)
+        .list(None, ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list root files failure")?
        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
+        "remote storage list on root mismatches with the uploads."
    );
    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
+        .list(
            None,
            ListingMode::NoDelimiter,
            Some(NonZeroU32::new(2).unwrap()),
            &cancel,
        )
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
+    assert_eq!(limited_root_files.keys.len(), 2);
    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None, &cancel)
+        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
+        "remote storage list on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
    ctx.client.delete_objects(&[path1, path2], &cancel).await?;
-    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
+    let prefixes = ctx
        .client
        .list(None, ListingMode::WithDelimiter, None, &cancel)
        .await?
        .prefixes;
    assert_eq!(prefixes.len(), 1);
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
-    S3Config,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None, cancel))
+        Ok(
-            .await
+            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
-            .context("list root files failure")?
+                .await
-            .into_iter()
+                .context("list root files failure")?
-            .collect::<HashSet<_>>())
+                .keys
                .into_iter()
                .collect::<HashSet<_>>(),
        )
    }
    let cancel = CancellationToken::new();
@@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
@@ -381,6 +380,7 @@ fn create_s3_client(
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
            upload_storage_class: None,
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
    pub safekeeper_connstr: Option<String>,
    #[serde(default)]
    pub http_connstr: Option<String>,
    // Minimum of all active RO replicas flush LSN
    #[serde(default = "lsn_invalid")]
    pub standby_horizon: Lsn,
 }
 #[derive(Debug, Clone, Deserialize, Serialize)]
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -9,6 +9,33 @@ use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 /// Declare a failpoint that can use the `pause` failpoint action.
 /// We don't want to block the executor thread, hence, spawn_blocking + await.
 #[macro_export]
 macro_rules! pausable_failpoint {
    // Unconditional form: hit the failpoint `$name`.
    //
    // `fail::fail_point!` is evaluated inside `spawn_blocking` and the result is awaited,
    // so the macro must be invoked from an async context. The current tracing span is
    // captured first and re-entered inside the closure so the log line emitted there
    // keeps the caller's span.
    ($name:literal) => {
        // The branch is only taken when the `testing` feature is enabled.
        if cfg!(feature = "testing") {
            tokio::task::spawn_blocking({
                let current = tracing::Span::current();
                move || {
                    let _entered = current.entered();
                    tracing::info!("at failpoint {}", $name);
                    fail::fail_point!($name);
                }
            })
            .await
            .expect("spawn_blocking");
        }
    };
    // Conditional form: hit the failpoint `$name` only when `$cond` evaluates to true.
    ($name:literal, $cond:expr) => {
        if cfg!(feature = "testing") {
            if $cond {
                // Recurse through `$crate::` so that the inner invocation resolves even when
                // the caller reached this macro by path and has no `pausable_failpoint` in scope.
                $crate::pausable_failpoint!($name)
            }
        }
    };
 }
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -34,6 +34,8 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
    pub const MAX: Self = Self::Valid(u32::MAX);
    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
    pub fn none() -> Self {
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -3,7 +3,7 @@
 //!  # Example
 //!
 //!  ```
-//!  # tokio_test::block_on(async {
+//!  # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
 //!  use utils::poison::Poison;
 //!  use std::time::Duration;
 //!
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,11 +2,10 @@
 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{channel, Receiver, Sender};
+use tokio::sync::watch::{self, channel};
 use tokio::time::timeout;
 /// An error happened while waiting for a number
@@ -35,23 +34,73 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }
-/// Internal components of a `SeqWait`
+/// Heap of waiters, lowest numbers pop first.
-struct SeqWaitInt<S, V>
+struct Waiters<V>
 where
    S: MonotonicCounter<V>,
    V: Ord,
 {
-    waiters: BinaryHeap<Waiter<V>>,
+    heap: BinaryHeap<Waiter<V>>,
-    current: S,
+    /// Number of the first waiter in the heap, or None if there are no waiters.
-    shutdown: bool,
+    status_channel: watch::Sender<Option<V>>,
 }
 impl<V> Waiters<V>
 where
    V: Ord + Copy,
 {
    /// Builds an empty set of waiters; the status channel starts out holding `None`.
    fn new() -> Self {
        // Only the sending half is kept; receivers are created on demand via `subscribe()`.
        let (status_channel, _) = channel(None);
        Self {
            heap: BinaryHeap::new(),
            status_channel,
        }
    }
    /// Publishes the number of the waiter currently at the top of the heap
    /// (or `None` when the heap is empty) on `status_channel`.
    ///
    /// Must be invoked after every modification of the heap.
    fn update_status(&self) {
        let head = self.heap.peek().map(|waiter| waiter.wake_num);
        let _previous = self.status_channel.send_replace(head);
    }
    /// Registers a waiter for `num` and hands back the receiving half of its wake-up channel.
    fn add(&mut self, num: V) -> watch::Receiver<()> {
        let (wake_channel, receiver) = channel(());
        let waiter = Waiter {
            wake_num: num,
            wake_channel,
        };
        self.heap.push(waiter);
        self.update_status();
        receiver
    }
    /// Removes every waiter whose number is `<= num` and returns their wake-up senders,
    /// leaving the actual notification to the caller.
    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
        let mut ready = Vec::new();
        while matches!(self.heap.peek(), Some(front) if front.wake_num <= num) {
            // `peek` just returned `Some`, so `pop` cannot fail here.
            let waiter = self.heap.pop().unwrap();
            ready.push(waiter.wake_channel);
        }
        self.update_status();
        ready
    }
    /// Empties the heap in one step and returns its previous contents; used on shutdown.
    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
        let drained = mem::take(&mut self.heap);
        self.update_status();
        drained
    }
 }
 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,              // wake me when this number arrives ...
+    wake_num: T,                     // wake me when this number arrives ...
-    wake_channel: Sender<()>, // ... by sending a message to this channel
+    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
 }
 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -76,6 +125,17 @@ impl<T: Ord> PartialEq for Waiter<T> {
 impl<T: Ord> Eq for Waiter<T> {}
 /// Internal components of a `SeqWait`
 struct SeqWaitInt<S, V>
 where
    S: MonotonicCounter<V>,
    V: Ord,
 {
    // Pending waiters (a heap keyed by the number each one waits for) together with
    // the status channel that advertises the first waiter's number.
    waiters: Waiters<V>,
    // The counter itself; its value is compared against requested numbers and it is
    // moved forward via `cnt_advance`.
    current: S,
    // Set to true on shutdown to block any future waiters from starting.
    shutdown: bool,
 }
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -108,7 +168,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: BinaryHeap::new(),
+            waiters: Waiters::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -128,9 +188,8 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;
-            // This will steal the entire waiters map.
+            // Take all waiters to drop them later.
-            // When we drop it all waiters will be woken.
+            internal.waiters.take_all()
            mem::take(&mut internal.waiters)
            // Drop the lock as we exit this scope.
        };
@@ -196,7 +255,7 @@ where
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -205,12 +264,8 @@ where
            return Err(SeqWaitError::Shutdown);
        }
-        // Create a new channel.
+        // Add waiter channel to the queue.
-        let (tx, rx) = channel(());
+        let rx = internal.waiters.add(num);
        internal.waiters.push(Waiter {
            wake_num: num,
            wake_channel: tx,
        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -231,16 +286,8 @@ where
            }
            internal.current.cnt_advance(num);
-            // Pop all waiters <= num from the heap. Collect them in a vector, and
+            // Pop all waiters <= num from the heap.
-            // wake them up after releasing the lock.
+            internal.waiters.pop_leq(num)
            let mut wake_these = Vec::new();
            while let Some(n) = internal.waiters.peek() {
                if n.wake_num > num {
                    break;
                }
                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
            }
            wake_these
        };
        for tx in wake_these {
@@ -255,6 +302,23 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
    /// Subscribe to the waiter status.
    ///
    /// The status value is the number the first queued waiter is waiting for,
    /// or `None` when nobody is waiting.
    ///
    /// The returned receiver is notified on every status change, which makes it
    /// suitable for noticing when the first waiter shows up and when the last
    /// one goes away.
    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
        let internal = self.internal.lock().unwrap();
        internal.waiters.status_channel.subscribe()
    }
 }
 #[cfg(test)]
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,7 +135,8 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());
-        let nag_after = Duration::from_secs(1);
+        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
        let nag_after = Duration::from_millis(100);
        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -50,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
    }
 }
 /// C-ABI trampoline for the `update_donor` callback.
 ///
 /// Recovers the Rust `ApiImpl` object stored in the walproposer config's
 /// `callback_data` and forwards `donor` and `donor_lsn` to its `update_donor` method.
 ///
 /// NOTE(review): `wp`, its `config`, `callback_data` and `donor` are dereferenced
 /// without null checks; presumably the C side guarantees they are valid — confirm.
 extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        // `callback_data` is an opaque pointer; it is reinterpreted as the boxed trait object.
        let api = callback_data as *mut Box<dyn ApiImpl>;
        (*api).update_donor(&mut (*donor), donor_lsn)
    }
 }
 extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
@@ -391,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api {
        get_shmem_state: Some(get_shmem_state),
        start_streaming: Some(start_streaming),
        get_flush_rec_ptr: Some(get_flush_rec_ptr),
        update_donor: Some(update_donor),
        get_current_timestamp: Some(get_current_timestamp),
        conn_error_message: Some(conn_error_message),
        conn_status: Some(conn_status),
@@ -421,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api {
    }
 }
 pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
    let empty_feedback = crate::bindings::PageserverFeedback {
        present: false,
        currentClusterSize: 0,
        last_received_lsn: 0,
        disk_consistent_lsn: 0,
        remote_consistent_lsn: 0,
        replytime: 0,
        shard_number: 0,
    };
    crate::bindings::WalproposerShmemState {
        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
        donor_name: [0; 64],
        donor_conninfo: [0; 1024],
        donor_lsn: 0,
        mutex: 0,
        mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
        backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
        shard_ps_feedback: [empty_feedback; 128],
        num_shards: 0,
        min_ps_feedback: empty_feedback,
    }
 }
 impl std::fmt::Display for Level {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{:?}", self)
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -1,8 +1,5 @@
 use std::ffi::CString;
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 use crate::{
    api_bindings::{create_api, take_vec_u8, Level},
    bindings::{
@@ -10,6 +7,8 @@ use crate::{
        WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
    },
 };
 use postgres_ffi::WAL_SEGMENT_SIZE;
 use utils::{id::TenantTimelineId, lsn::Lsn};
 /// Rust high-level wrapper for C walproposer API. Many methods are not required
 /// for simple cases, hence todo!() in default implementations.
@@ -28,6 +27,10 @@ pub trait ApiImpl {
        todo!()
    }
    fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
        todo!()
    }
    fn get_current_timestamp(&self) -> i64 {
        todo!()
    }
@@ -274,6 +277,7 @@ mod tests {
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
    };
    use std::cell::UnsafeCell;
    use utils::id::TenantTimelineId;
    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
@@ -297,6 +301,8 @@ mod tests {
        replies_ptr: AtomicUsize,
        // channel to send LSN to the main thread
        sync_channel: std::sync::mpsc::SyncSender<u64>,
        // Shmem state, used for storing donor info
        shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
    }
    impl MockImpl {
@@ -327,11 +333,22 @@ mod tests {
    }
    impl ApiImpl for MockImpl {
        /// Returns a raw pointer to the mock's shmem state, which is stored in
        /// the `shmem` `UnsafeCell` field of `MockImpl`.
        fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
            self.shmem.get()
        }
        /// Mock clock: prints a trace line to stdout and always reports
        /// timestamp `0`.
        fn get_current_timestamp(&self) -> i64 {
            println!("get_current_timestamp");
            0
        }
        /// Records the chosen donor in the mock shared-memory state.
        ///
        /// Stores `donor_lsn` into both `propEpochStartLsn` and `donor_lsn`, and
        /// copies the donor's `conninfo` into `donor_conninfo`.
        fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
            // Borrow the state behind the pointer instead of dereferencing it by
            // value: `*ptr` would copy the whole struct, and every write below
            // would land in a temporary that is dropped on return.
            //
            // SAFETY: the pointer comes from `self.shmem` (an `UnsafeCell` owned by
            // `self`), so it is non-null and valid while `self` is borrowed.
            // NOTE(review): assumes no other reference to the state is live while
            // this runs — confirm against how the C side uses the pointer.
            let shmem = unsafe { &mut *self.get_shmem_state() };
            shmem.propEpochStartLsn.value = donor_lsn;
            shmem.donor_conninfo = donor.conninfo;
            shmem.donor_lsn = donor_lsn;
        }
        fn conn_status(
            &self,
            _: &mut crate::bindings::Safekeeper,
@@ -479,9 +496,9 @@ mod tests {
                // TODO: When updating Postgres versions, this test will cause
                // problems. Postgres version in message needs updating.
                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -507,6 +524,7 @@ mod tests {
            ],
            replies_ptr: AtomicUsize::new(0),
            sync_channel: sender,
            shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
        });
        let config = crate::walproposer::Config {
            ttid,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
 twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,7 +1,7 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut updates = layer_map.batch_update();
    for fname in filenames {
        let fname = fname.unwrap();
-        let fname = LayerFileName::from_str(&fname).unwrap();
+        let fname = LayerName::from_str(&fname).unwrap();
        let layer = PersistentLayerDesc::from(fname);
        let lsn_range = layer.get_lsn_range();
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -30,47 +30,27 @@
 //! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! async-short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
+//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! async-short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
+//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! async-short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
+//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! async-short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
+//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! async-short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
+//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! async-short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
+//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! async-short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
+//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! async-short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
+//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! async-medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
+//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! async-medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
+//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! async-medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
+//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! async-medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
+//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! async-medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
+//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! async-medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
+//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! async-medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
+//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! async-medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
 //! sync-short/1            time:   [25.503 µs 25.626 µs 25.771 µs]
 //! sync-short/2            time:   [30.850 µs 31.013 µs 31.208 µs]
 //! sync-short/4            time:   [45.543 µs 45.856 µs 46.193 µs]
 //! sync-short/8            time:   [84.114 µs 84.639 µs 85.220 µs]
 //! sync-short/16           time:   [185.22 µs 186.15 µs 187.13 µs]
 //! sync-short/32           time:   [377.43 µs 378.87 µs 380.46 µs]
 //! sync-short/64           time:   [756.49 µs 759.04 µs 761.70 µs]
 //! sync-short/128          time:   [1.4825 ms 1.4874 ms 1.4923 ms]
 //! sync-medium/1           time:   [105.66 µs 106.01 µs 106.43 µs]
 //! sync-medium/2           time:   [153.10 µs 153.84 µs 154.72 µs]
 //! sync-medium/4           time:   [327.13 µs 329.44 µs 332.27 µs]
 //! sync-medium/8           time:   [654.26 µs 658.73 µs 663.63 µs]
 //! sync-medium/16          time:   [1.2682 ms 1.2748 ms 1.2816 ms]
 //! sync-medium/32          time:   [2.4456 ms 2.4595 ms 2.4731 ms]
 //! sync-medium/64          time:   [4.6523 ms 4.6890 ms 4.7256 ms]
 //! sync-medium/128         time:   [8.7215 ms 8.8323 ms 8.9344 ms]
 //! ```
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
    config::PageServerConf,
    walrecord::NeonWalRecord,
    walredo::{PostgresRedoManager, ProcessKind},
 };
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
    sync::Arc,
@@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 fn bench(c: &mut Criterion) {
-    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
+    {
-        {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
-            for nclients in nclients {
+            let mut group = c.benchmark_group("short");
-                let mut group = c.benchmark_group(format!("{process_kind}-short"));
+            group.bench_with_input(
-                group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
-                    BenchmarkId::from_parameter(nclients),
+                &nclients,
-                    &nclients,
+                |b, nclients| {
-                    |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
-                        let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                        b.iter_custom(|iters| {
+                },
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+            );
                        });
                    },
                );
            }
        }
-
+    }
-        {
+    {
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            for nclients in nclients {
+        for nclients in nclients {
-                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
+            let mut group = c.benchmark_group("medium");
-                group.bench_with_input(
+            group.bench_with_input(
-                    BenchmarkId::from_parameter(nclients),
+                BenchmarkId::from_parameter(nclients),
-                    &nclients,
+                &nclients,
-                    |b, nclients| {
+                |b, nclients| {
-                        let redo_work = Arc::new(Request::medium_input());
+                    let redo_work = Arc::new(Request::medium_input());
-                        b.iter_custom(|iters| {
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                },
-                        });
+            );
                    },
                );
            }
        }
    }
 }
@@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
    process_kind: ProcessKind,
    redo_work: Arc<Request>,
    n_redos: u64,
    nclients: u64,
 ) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
-    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    conf.walredo_process_kind = process_kind;
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -158,27 +125,13 @@ fn bench_impl(
        });
    }
-    let elapsed = rt.block_on(async move {
+    rt.block_on(async move {
        let mut total_wallclock_time = Duration::ZERO;
        while let Some(res) = tasks.join_next().await {
            total_wallclock_time += res.unwrap();
        }
        total_wallclock_time
-    });
+    })
    // consistency check to ensure process kind setting worked
    if nredos_per_client > 0 {
        assert_eq!(
            manager
                .status()
                .process
                .map(|p| p.kind)
                .expect("the benchmark work causes a walredo process to be spawned"),
            std::borrow::Cow::Borrowed(process_kind.into())
        );
    }
    elapsed
 }
 async fn client(
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,8 +1,12 @@
 use std::collections::HashMap;
 use bytes::Bytes;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
 pub mod util;
@@ -243,6 +247,19 @@ impl Client {
        Ok(())
    }
    pub async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,
    ) -> Result<TenantScanRemoteStorageResponse> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/scan_remote_storage",
            self.mgmt_api_endpoint
        );
        let response = self.request(Method::GET, &uri, ()).await?;
        let body = response.json().await.map_err(Error::ReceiveBody)?;
        Ok(body)
    }
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
@@ -271,6 +288,34 @@ impl Client {
        Ok((status, progress))
    }
    pub async fn tenant_secondary_status(
        &self,
        tenant_shard_id: TenantShardId,
    ) -> Result<SecondaryProgress> {
        let path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/secondary/status",
            self.mgmt_api_endpoint, tenant_shard_id
        ))
        .expect("Cannot build URL");
        self.request(Method::GET, path, ())
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }
    /// Sends `POST {mgmt_api_endpoint}/v1/tenant/{tenant_id}/heatmap_upload`
    /// with an empty request body and discards the response.
    ///
    /// Panics if the formatted URL cannot be parsed.
    pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
        let raw_url = format!(
            "{}/v1/tenant/{}/heatmap_upload",
            self.mgmt_api_endpoint, tenant_id
        );
        let url = reqwest::Url::parse(&raw_url).expect("Cannot build URL");

        // Only success or failure matters here; the response itself is unused.
        let _response = self.request(Method::POST, url, ()).await?;
        Ok(())
    }
    pub async fn location_config(
        &self,
        tenant_shard_id: TenantShardId,
@@ -278,10 +323,7 @@ impl Client {
        flush_ms: Option<std::time::Duration>,
        lazy: bool,
    ) -> Result<()> {
-        let req_body = TenantLocationConfigRequest {
+        let req_body = TenantLocationConfigRequest { config };
            tenant_id: Some(tenant_shard_id),
            config,
        };
        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/location_config",
@@ -448,6 +490,18 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }
    pub async fn top_tenant_shards(
        &self,
        request: TopTenantShardsRequest,
    ) -> Result<TopTenantShardsResponse> {
        let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
        self.request(Method::POST, uri, request)
            .await?
            .json()
            .await
            .map_err(Error::ReceiveBody)
    }
    pub async fn layer_map_info(
        &self,
        tenant_shard_id: TenantShardId,
@@ -511,4 +565,57 @@ impl Client {
            }),
        }
    }
    /// Sends `aux_files` via
    /// `POST {mgmt_api_endpoint}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/ingest_aux_files`.
    ///
    /// Returns `Ok(true)` on HTTP 200. Any other status becomes an error:
    /// `Error::ApiError` when the body decodes as an `HttpErrorBody`, otherwise
    /// `Error::ReceiveErrorBody` naming the status code and URI.
    pub async fn ingest_aux_files(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        aux_files: HashMap<String, String>,
    ) -> Result<bool> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
        );
        let payload = IngestAuxFilesRequest { aux_files };
        let resp = self.request_noerror(Method::POST, &uri, payload).await?;

        let status = resp.status();
        if status == StatusCode::OK {
            return Ok(true);
        }

        // Non-200: prefer the server-provided message, fall back to a generic one.
        let failure = match resp.json::<HttpErrorBody>().await {
            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
            Err(_) => {
                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
            }
        };
        Err(failure)
    }
    pub async fn list_aux_files(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        lsn: Lsn,
    ) -> Result<HashMap<String, Bytes>> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline/{}/list_aux_files",
            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
        );
        let resp = self
            .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
            .await?;
        match resp.status() {
            StatusCode::OK => {
                let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
                    Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
                })?;
                Ok(resp)
            }
            status => Err(match resp.json::<HttpErrorBody>().await {
                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
                Err(_) => {
                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
                }
            }),
        }
    }
 }
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -60,7 +60,7 @@ impl Client {
    ) -> anyhow::Result<PagestreamClient> {
        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
            .client
-            .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
+            .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
            .await?;
        let Client {
            cancel_on_client_drop,
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,4 +1,5 @@
 use clap::{Parser, Subcommand};
 use pageserver_compaction::helpers::PAGE_SZ;
 use pageserver_compaction::simulator::MockTimeline;
 use rand::Rng;
 use std::io::Write;
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
    let mut executor = MockTimeline::new();
    // Convert the logical size in MB into a key range.
-    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
    //let key_range = u64::MIN..u64::MAX;
    println!(
        "starting simulation with key range {:016X}-{:016X}",
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -18,12 +18,15 @@
 //! database size. For example, if the logical database size is 10 GB, we would
 //! generate new image layers every 10 GB of WAL.
 use futures::StreamExt;
 use pageserver_api::shard::ShardIdentity;
 use tracing::{debug, info};
 use std::collections::{HashSet, VecDeque};
 use std::ops::Range;
-use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::helpers::{
    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
 };
 use crate::interface::*;
 use utils::lsn::Lsn;
@@ -103,7 +106,13 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
            ctx,
        )
        .await?;
-        if target_file_size == u64::MAX {
+        if current_level_target_height == u64::MAX {
            // our target height includes all possible lsns
            info!(
                level = current_level_no,
                depth = depth,
                "compaction loop reached max current_level_target_height"
            );
            break;
        }
        current_level_no += 1;
@@ -125,6 +134,7 @@ async fn compact_level<E: CompactionJobExecutor>(
    }
    let mut state = LevelCompactionState {
        shard_identity: *executor.get_shard_identity(),
        target_file_size,
        _lsn_range: lsn_range.clone(),
        layers: layer_fragments,
@@ -164,6 +174,8 @@ struct LevelCompactionState<'a, E>
 where
    E: CompactionJobExecutor,
 {
    shard_identity: ShardIdentity,
    // parameters
    target_file_size: u64,
@@ -366,7 +378,8 @@ where
                .executor
                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                .await?,
-        ) * 8192;
+            &self.shard_identity,
        ) * PAGE_SZ;
        let wal_size = job
            .input_layers
@@ -428,9 +441,9 @@ where
        let mut window = KeyspaceWindow::new(
            E::Key::MIN..E::Key::MAX,
            keyspace,
-            self.target_file_size / 8192,
+            self.target_file_size / PAGE_SZ,
        );
-        while let Some(key_range) = window.choose_next_image() {
+        while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
            new_jobs.push(CompactionJob::<E> {
                key_range,
                lsn_range: job.lsn_range.clone(),
@@ -517,8 +530,6 @@ where
        // If we have accumulated only a narrow band of keyspace, create an
        // image layer. Otherwise write a delta layer.
        // FIXME: deal with the case of lots of values for same key
        // FIXME: we are ignoring images here. Did we already divide the work
        // so that we won't encounter them here?
@@ -530,43 +541,101 @@ where
            }
        }
        // Open stream
-        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let key_value_stream =
            std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
                .await?
                .map(Result::<_, anyhow::Error>::Ok));
        let mut new_jobs = Vec::new();
        // Slide a window through the keyspace
-        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        let mut key_accum =
            std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
        let mut all_in_window: bool = false;
        let mut window = Window::new();
        // Helper function to create a job for a new delta layer with given key-lsn
        // rectangle.
        let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
            // The inputs for the job are all the input layers of the original job that
            // overlap with the rectangle.
            let batch_layers: Vec<LayerId> = job
                .input_layers
                .iter()
                .filter(|layer_id| {
                    overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
                })
                .cloned()
                .collect();
            assert!(!batch_layers.is_empty());
            new_jobs.push(CompactionJob {
                key_range,
                lsn_range: lsn_range.clone(),
                strategy: CompactionStrategy::CreateDelta,
                input_layers: batch_layers,
                completed: false,
            });
        };
        loop {
-            if all_in_window && window.elems.is_empty() {
+            if all_in_window && window.is_empty() {
                // All done!
                break;
            }
            // If we now have enough keyspace for next delta layer in the window, create a
            // new delta layer
            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
            {
-                let batch_layers: Vec<LayerId> = job
+                create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
-                    .input_layers
+                continue;
-                    .iter()
+            }
-                    .filter(|layer_id| {
+            assert!(!all_in_window);
-                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+
-                    })
+            // Process next key in the key space
-                    .cloned()
+            match key_accum.next().await.transpose()? {
-                    .collect();
+                None => {
                assert!(!batch_layers.is_empty());
                new_jobs.push(CompactionJob {
                    key_range,
                    lsn_range: job.lsn_range.clone(),
                    strategy: CompactionStrategy::CreateDelta,
                    input_layers: batch_layers,
                    completed: false,
                });
            } else {
                assert!(!all_in_window);
                if let Some(next_key) = key_accum.next().await.transpose()? {
                    window.feed(next_key.key, next_key.size);
                } else {
                    all_in_window = true;
                }
                Some(next_key) if next_key.partition_lsns.is_empty() => {
                    // Normal case: extend the window by the key
                    window.feed(next_key.key, next_key.size);
                }
                Some(next_key) => {
                    // A key with too large size impact for a single delta layer. This
                    // case occurs if you make a huge number of updates for a single key.
                    //
                    // Drain the window with has_more = false to make a clean cut before
                    // the key, and then make dedicated delta layers for the single key.
                    //
                    // We cannot cluster the key with the others, because we don't want
                    // layer files to overlap with each other in the lsn,key space (no
                    // overlaps for the rectangles).
                    let key = next_key.key;
                    debug!("key {key} with size impact larger than the layer size");
                    while !window.is_empty() {
                        let has_more = false;
                        let key_range = window.choose_next_delta(self.target_file_size, has_more)
                            .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
                        create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
                    }
                    // Not really required: but here for future resilience:
                    // We make a "gap" here, so any structure the window holds should
                    // probably be reset.
                    window = Window::new();
                    let mut prior_lsn = job.lsn_range.start;
                    let mut lsn_ranges = Vec::new();
                    for (lsn, _size) in next_key.partition_lsns.iter() {
                        lsn_ranges.push(prior_lsn..*lsn);
                        prior_lsn = *lsn;
                    }
                    lsn_ranges.push(prior_lsn..job.lsn_range.end);
                    for lsn_range in lsn_ranges {
                        let key_range = key..key.next();
                        create_delta_job(key_range, &lsn_range, &mut new_jobs);
                    }
                }
            }
        }
@@ -594,8 +663,8 @@ where
    }
 }
-// Sliding window through keyspace and values
+/// Sliding window through keyspace and values for image layer
-// This is used by over_with_images to decide on good split points
+/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
 struct KeyspaceWindow<K> {
    head: KeyspaceWindowHead<K>,
@@ -623,7 +692,12 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
    }
    // Advance the cursor until it reaches 'target_keysize'.
-    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
+    fn advance_until_size(
        &mut self,
        w: &KeyspaceWindowHead<K>,
        max_size: u64,
        shard_identity: &ShardIdentity,
    ) {
        while self.accum_keysize < max_size && !self.reached_end(w) {
            let curr_range = &w.keyspace[self.keyspace_idx];
            if self.end_key < curr_range.start {
@@ -632,7 +706,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            }
            // We're now within 'curr_range'. Can we advance past it completely?
-            let distance = K::key_range_size(&(self.end_key..curr_range.end));
+            let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
            if (self.accum_keysize + distance as u64) < max_size {
                // oh yeah, it fits
                self.end_key = curr_range.end;
@@ -641,7 +715,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            } else {
                // advance within the range
                let skip_key = self.end_key.skip_some();
-                let distance = K::key_range_size(&(self.end_key..skip_key));
+                let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
                if (self.accum_keysize + distance as u64) < max_size {
                    self.end_key = skip_key;
                    self.accum_keysize += distance as u64;
@@ -677,7 +751,7 @@ where
        }
    }
-    fn choose_next_image(&mut self) -> Option<Range<K>> {
+    fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
            // we've reached the end
            return None;
@@ -687,6 +761,7 @@ where
        next_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + self.head.target_keysize,
            shard_identity,
        );
        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
@@ -695,6 +770,7 @@ where
        end_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
            shard_identity,
        );
        if end_pos.reached_end(&self.head) {
            // gobble up any unused keyspace between the last used key and end of the range
@@ -728,9 +804,9 @@ struct WindowElement<K> {
    accum_size: u64,
 }
-// Sliding window through keyspace and values
+/// Sliding window through keyspace and values for delta layer tiling
-//
+///
-// This is used to decide what layer to write next, from the beginning of the window.
+/// This is used to decide which delta layer to write next.
 struct Window<K> {
    elems: VecDeque<WindowElement<K>>,
@@ -754,11 +830,13 @@ where
    fn feed(&mut self, key: K, size: u64) {
        let last_size;
        if let Some(last) = self.elems.back_mut() {
-            assert!(last.last_key <= key);
+            // We require the keys to be strictly increasing for the window.
-            if key == last.last_key {
+            // Keys should already have been deduplicated by `accum_key_values`
-                last.accum_size += size;
+            assert!(
-                return;
+                last.last_key < key,
-            }
+                "last_key(={}) >= key(={key})",
                last.last_key
            );
            last_size = last.accum_size;
        } else {
            last_size = 0;
@@ -780,6 +858,10 @@ where
        self.elems.front().unwrap().accum_size - self.splitoff_size
    }
    fn is_empty(&self) -> bool {
        self.elems.is_empty()
    }
    fn commit_upto(&mut self, mut upto: usize) {
        while upto > 1 {
            let popped = self.elems.pop_front().unwrap();
@@ -842,7 +924,7 @@ where
        // If we're willing to stretch it up to 1.25 target size, could we
        // gobble up the rest of the work? This avoids creating very small
        // "tail" layers at the end of the keyspace
-        if !has_more && self.remain_size() < target_size * 5 / 3 {
+        if !has_more && self.remain_size() < target_size * 5 / 4 {
            self.commit_upto(self.elems.len());
        } else {
            let delta_split_at = self.find_size_split(target_size);
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -5,19 +5,30 @@ use crate::interface::*;
 use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use pageserver_api::shard::ShardIdentity;
 use pin_project_lite::pin_project;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
 use std::fmt::Display;
 use std::future::Future;
 use std::ops::{DerefMut, Range};
 use std::pin::Pin;
 use std::task::{ready, Poll};
 use utils::lsn::Lsn;
-pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
+pub const PAGE_SZ: u64 = 8192;
 pub fn keyspace_total_size<K>(
    keyspace: &CompactionKeySpace<K>,
    shard_identity: &ShardIdentity,
 ) -> u64
 where
    K: CompactionKey,
 {
-    keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
+    keyspace
        .iter()
        .map(|r| K::key_range_size(r, shard_identity) as u64)
        .sum()
 }
 pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
@@ -101,17 +112,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
    }
 }
 /// Eagerly load the delta entries of every layer in `layers` and return them
 /// as a single stream ordered by `(key, lsn)`.
 ///
 /// All entries are collected into one `Vec` and sorted before the stream is
 /// created, so nothing is loaded lazily here. An error from any `load_keys`
 /// call is returned immediately via `?`.
 pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
    layers: &'a [E::DeltaLayer],
    ctx: &'a E::RequestContext,
 ) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
 {
    // Accumulates the entries of all layers before sorting.
    let mut keys = Vec::new();
    for l in layers {
        // Boxing and casting to LoadFuture is required to obtain the right Sync bound.
        // If we do l.load_keys(ctx).await? directly, there is a compilation error.
        let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
        keys.extend(load_future.await?.into_iter());
    }
    // Order by key first, then by LSN within the same key.
    keys.sort_by_key(|k| (k.key(), k.lsn()));
    let stream = futures::stream::iter(keys.into_iter());
    Ok(stream)
 }
 /// A delta layer whose entries are either already held in memory or not yet
 /// loaded.
 enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
    /// Entries held in a `VecDeque`; the front entry supplies the key and LSN
    /// read by the accessor methods.
    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
    /// Reference to a layer whose entries are not held here; the start of its
    /// key range and LSN range are used instead.
    Unloaded(&'a E::DeltaLayer),
 }
 impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
    /// Key of the front entry when loaded, otherwise the start of the layer's
    /// key range.
    ///
    /// Panics (via `unwrap`) if a `Loaded` queue is empty.
-    fn key(&self) -> E::Key {
+    fn min_key(&self) -> E::Key {
        match self {
            Self::Loaded(entries) => entries.front().unwrap().key(),
            Self::Unloaded(dl) => dl.key_range().start,
        }
    }
    /// LSN of the front entry when loaded, otherwise the start of the layer's
    /// LSN range.
    ///
    /// Panics (via `unwrap`) if a `Loaded` queue is empty.
    fn min_lsn(&self) -> Lsn {
        match self {
            Self::Loaded(entries) => entries.front().unwrap().lsn(),
            Self::Unloaded(dl) => dl.lsn_range().start,
        }
    }
 }
 impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
@@ -121,12 +155,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
 impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // reverse order so that we get a min-heap
        // (`other` is the receiver of the comparison, so whichever of the two
        // layers sorts lower naturally is reported here as the greater one)
-        other.key().cmp(&self.key())
+        (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
    }
 }
 impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
    fn eq(&self, other: &Self) -> bool {
-        self.key().eq(&other.key())
+        self.cmp(other) == std::cmp::Ordering::Equal
    }
 }
 impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
@@ -203,11 +237,16 @@ pub struct KeySize<K> {
    pub key: K,
    pub num_values: u64,
    pub size: u64,
    /// The lsns to partition at (if empty then no per-lsn partitioning)
    pub partition_lsns: Vec<(Lsn, u64)>,
 }
-pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+pub fn accum_key_values<'a, I, K, D, E>(
    input: I,
    target_size: u64,
 ) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
-    K: Eq,
+    K: Eq + PartialOrd + Display + Copy,
    I: Stream<Item = Result<D, E>>,
    D: CompactionDeltaEntry<'a, K>,
 {
@@ -217,25 +256,39 @@ where
        if let Some(first) = input.next().await {
            let first = first?;
            let mut part_size = first.size();
            let mut accum: KeySize<K> = KeySize {
                key: first.key(),
                num_values: 1,
-                size: first.size(),
+                size: part_size,
                partition_lsns: Vec::new(),
            };
            let mut last_key = accum.key;
            while let Some(this) = input.next().await {
                let this = this?;
                if this.key() == accum.key {
-                    accum.size += this.size();
+                    let add_size = this.size();
                    if part_size + add_size > target_size {
                        accum.partition_lsns.push((this.lsn(), part_size));
                        part_size = 0;
                    }
                    part_size += add_size;
                    accum.size += add_size;
                    accum.num_values += 1;
                } else {
                    assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
                    last_key = accum.key;
                    yield accum;
                    part_size = this.size();
                    accum = KeySize {
                        key: this.key(),
                        num_values: 1,
-                        size: this.size(),
+                        size: part_size,
                        partition_lsns: Vec::new(),
                    };
                }
            }
            assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
            yield accum;
        }
    }
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -184,6 +184,12 @@ impl<L> Level<L> {
        }
        let mut events: Vec<Event<K>> = Vec::new();
        for (idx, l) in self.layers.iter().enumerate() {
            let key_range = l.key_range();
            if key_range.end == key_range.start.next() && l.is_delta() {
                // Ignore single-key delta layers as they can be stacked on top of each other
                // as that is the only way to cut further.
                continue;
            }
            events.push(Event {
                key: l.key_range().start,
                layer_idx: idx,
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -4,7 +4,7 @@
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
 use futures::Future;
-use pageserver_api::{key::Key, keyspace::key_range_size};
+use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
 use std::ops::Range;
 use utils::lsn::Lsn;
@@ -32,6 +32,8 @@ pub trait CompactionJobExecutor {
    // Functions that the planner uses to support its decisions
    // ----
    fn get_shard_identity(&self) -> &ShardIdentity;
    /// Return all layers that overlap the given bounding box.
    fn get_layers(
        &mut self,
@@ -98,7 +100,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
    ///
    /// This returns u32, for compatibility with Repository::key. If the
    /// distance is larger, return u32::MAX.
-    fn key_range_size(key_range: &Range<Self>) -> u32;
+    fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
    // return "self + 1"
    fn next(&self) -> Self;
@@ -113,8 +115,8 @@ impl CompactionKey for Key {
    const MIN: Self = Self::MIN;
    const MAX: Self = Self::MAX;
-    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
+    fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
-        key_range_size(r)
+        ShardedRange::new(r.clone(), shard_identity).page_count()
    }
    fn next(&self) -> Key {
        (self as &Key).next()
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -3,6 +3,7 @@ mod draw;
 use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
 use futures::StreamExt;
 use pageserver_api::shard::ShardIdentity;
 use rand::Rng;
 use tracing::info;
@@ -13,6 +14,7 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::sync::Mutex;
 use crate::helpers::PAGE_SZ;
 use crate::helpers::{merge_delta_keys, overlaps_with};
 use crate::interface;
@@ -71,7 +73,7 @@ impl interface::CompactionKey for Key {
    const MIN: Self = u64::MIN;
    const MAX: Self = u64::MAX;
-    fn key_range_size(key_range: &Range<Self>) -> u32 {
+    fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
    }
@@ -378,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size(),
+            MockLayer::Delta(this) => this.file_size,
-            MockLayer::Image(this) => this.file_size(),
+            MockLayer::Image(this) => this.file_size,
        }
    }
    fn short_id(&self) -> String {
@@ -434,6 +436,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
    type ImageLayer = Arc<MockImageLayer>;
    type RequestContext = MockRequestContext;
    /// Returns a reference to a function-local `static` built with
    /// `ShardIdentity::unsharded()`, so every call yields the same identity.
    fn get_shard_identity(&self) -> &ShardIdentity {
        static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
        &IDENTITY
    }
    async fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,
@@ -503,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
        let new_layer = Arc::new(MockImageLayer {
            key_range: key_range.clone(),
            lsn_range: lsn..lsn,
-            file_size: accum_size * 8192,
+            file_size: accum_size * PAGE_SZ,
            deleted: Mutex::new(false),
        });
        info!(
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -1,23 +1,35 @@
 use once_cell::sync::OnceCell;
 use pageserver_compaction::interface::CompactionLayer;
 use pageserver_compaction::simulator::MockTimeline;
 use utils::logging;
 static LOG_HANDLE: OnceCell<()> = OnceCell::new();
 /// Initialise logging for the tests, guarded by `LOG_HANDLE` so that the
 /// initialisation closure runs only on the first call.
 ///
 /// Panics with "Failed to init test logging" if `logging::init` returns an
 /// error.
 pub(crate) fn setup_logging() {
    let init_logging = || {
        let outcome = logging::init(
            logging::LogFormat::Test,
            logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
            logging::Output::Stdout,
        );
        outcome.expect("Failed to init test logging")
    };
    LOG_HANDLE.get_or_init(init_logging);
 }
 /// Test the extreme case that there are so many updates for a single key that
 /// even if we produce an extremely narrow delta layer, spanning just that one
 /// key, we still have too many records to fit in the target file size. We need to
 /// split in the LSN dimension too in that case.
 ///
 /// TODO: The code to avoid this problem has not been implemented yet! So the
 /// assertion currently fails, but we need to make it not fail.
 #[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
    setup_logging();
    let mut executor = MockTimeline::new();
-    executor.target_file_size = 10_000_000; // 10 MB
+    executor.target_file_size = 1_000_000; // 1 MB
-    // Ingest 100 MB of updates to a single key.
+    // Ingest 10 MB of updates to a single key.
    for _ in 1..1000 {
        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
-        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
        executor.compact().await.unwrap();
    }
@@ -27,9 +39,32 @@ async fn test_many_updates_for_single_key() {
    }
    for l in executor.live_layers.iter() {
        assert!(l.file_size() < executor.target_file_size * 2);
-        // sanity check that none of the delta layers are stupidly small either
+        // Sanity check that none of the delta layers are empty either.
        if l.is_delta() {
-            assert!(l.file_size() > executor.target_file_size / 2);
+            assert!(l.file_size() > 0);
        }
    }
 }
 #[tokio::test]
 async fn test_simple_updates() {
    setup_logging();
    let mut executor = MockTimeline::new();
    executor.target_file_size = 500_000; // 500 KB
    // Ingest some traffic.
    for _ in 1..400 {
        executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
    }
    for l in executor.live_layers.iter() {
        println!("layer {}: {}", l.short_id(), l.file_size());
    }
    println!("Running compaction...");
    executor.compact().await.unwrap();
    for l in executor.live_layers.iter() {
        println!("layer {}: {}", l.short_id(), l.file_size());
    }
 }
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,21 +9,49 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
 //! sources; see the Usage section below for examples.
 //!
 //! # Usage
 //!
 //! ## Producing the SVG
 //!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!
-//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//! # local timeline dir
-//! $ firefox out.svg
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
 //!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //!
 //! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
 //! (jq -r '.historic_layers[] | .layer_file_name' | cargo  run -p pagectl draw-timeline) < layer-map.json > out.svg
 //!
 //! # From an `index_part.json` in S3
 //! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
 //!
 //! # enrich with lines for gc_cutoff and a child branch point
 //! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
+//! ## Viewing
 //! or from pageserver log files.
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
+//! **Inkscape** is better than the built-in viewers in browsers.
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//!
-use anyhow::Result;
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
 //! to see the layer file name in the comment field.
 //!
 //! ```bash
 //!
 //! # Linux
 //! inkscape out.svg
 //!
 //! # macOS
 //! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
 //!
 //! ```
 //!
 use anyhow::{Context, Result};
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -54,6 +82,11 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    let split: Vec<&str> = name.split("__").collect();
    let keys: Vec<&str> = split[0].split('-').collect();
    let mut lsns: Vec<&str> = split[1].split('-').collect();
    if lsns.last().expect("should").len() == 8 {
        lsns.pop();
    }
    if lsns.len() == 1 {
        lsns.push(lsns[0]);
    }
@@ -63,33 +96,94 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    (keys, lsns)
 }
 /// Kind of marker line that can be supplied on stdin as `<kind>:<lsn>`.
 #[derive(Clone, Copy)]
 enum LineKind {
    /// Parsed from the string `gc_cutoff`; filled with rgb(255, 0, 0).
    GcCutoff,
    /// Parsed from the string `branch`; filled with rgb(0, 255, 0).
    Branch,
 }
 impl From<LineKind> for Fill {
    /// Fill colour for a marker line: pure red for `GcCutoff`, pure green for
    /// `Branch`.
    fn from(value: LineKind) -> Self {
        let (red, green, blue) = match value {
            LineKind::GcCutoff => (255, 0, 0),
            LineKind::Branch => (0, 255, 0),
        };
        Fill::Color(rgb(red, green, blue))
    }
 }
 impl FromStr for LineKind {
    type Err = anyhow::Error;

    /// Parse the kind part of a `<kind>:<lsn>` line.
    ///
    /// Accepts exactly `gc_cutoff` and `branch`; any other input yields an
    /// "unsupported linekind" error.
    fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
        match s {
            "gc_cutoff" => Ok(LineKind::GcCutoff),
            "branch" => Ok(LineKind::Branch),
            _ => Err(anyhow::anyhow!("unsupported linekind: {s}")),
        }
    }
 }
 pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
        filename: String,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
    }
    let mut files: Vec<Layer> = vec![];
    let stdin = io::stdin();
-    for line in stdin.lock().lines() {
+
    let mut lines: Vec<(Lsn, LineKind)> = vec![];
    for (lineno, line) in stdin.lock().lines().enumerate() {
        let lineno = lineno + 1;
        let line = line.unwrap();
        if let Some((kind, lsn)) = line.split_once(':') {
            let (kind, lsn) = LineKind::from_str(kind)
                .context("parse kind")
                .and_then(|kind| {
                    if lsn.contains('/') {
                        Lsn::from_str(lsn)
                    } else {
                        Lsn::from_hex(lsn)
                    }
                    .map(|lsn| (kind, lsn))
                    .context("parse lsn")
                })
                .with_context(|| format!("parse {line:?} on {lineno}"))?;
            lines.push((lsn, kind));
            continue;
        }
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
-        if filename == METADATA_FILE_NAME {
+        let (key_range, lsn_range) = parse_filename(filename);
-            // Don't try and parse "metadata" like a key-lsn range
+        files.push(Layer {
-            continue;
+            filename: filename.to_owned(),
-        }
+            key_range,
-        let range = parse_filename(filename);
+            lsn_range,
-        ranges.push(range);
+        });
    }
    // Collect all coordinates
-    let mut keys: Vec<Key> = vec![];
+    let mut keys: Vec<Key> = Vec::with_capacity(files.len());
-    let mut lsns: Vec<Lsn> = vec![];
+    let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
-    for (keyr, lsnr) in &ranges {
+
    for Layer {
        key_range: keyr,
        lsn_range: lsnr,
        ..
    } in &files
    {
        keys.push(keyr.start);
        keys.push(keyr.end);
        lsns.push(lsnr.start);
        lsns.push(lsnr.end);
    }
    lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
    // Analyze
    let key_map = build_coordinate_compression_map(keys);
    let lsn_map = build_coordinate_compression_map(lsns);
@@ -103,11 +197,19 @@ pub fn main() -> Result<()> {
    println!(
        "{}",
        BeginSvg {
-            w: key_map.len() as f32,
+            w: (key_map.len() + 10) as f32,
            h: stretch * lsn_map.len() as f32
        }
    );
-    for (keyr, lsnr) in &ranges {
+
    let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
    for Layer {
        filename,
        key_range: keyr,
        lsn_range: lsnr,
    } in &files
    {
        let key_start = *key_map.get(&keyr.start).unwrap();
        let key_end = *key_map.get(&keyr.end).unwrap();
        let key_diff = key_end - key_start;
@@ -123,7 +225,6 @@ pub fn main() -> Result<()> {
        let mut lsn_diff = (lsn_end - lsn_start) as f32;
        let mut fill = Fill::None;
        let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
        let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
        let mut lsn_offset = 0.0;
        // Fill in and thicken rectangle if it's an
@@ -143,7 +244,7 @@ pub fn main() -> Result<()> {
        println!(
            "    {}",
            rectangle(
-                key_start as f32 + stretch * xmargin,
+                5.0 + key_start as f32 + stretch * xmargin,
                stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
                key_diff as f32 - stretch * 2.0 * xmargin,
                stretch * (lsn_diff - 2.0 * ymargin)
@@ -151,8 +252,29 @@ pub fn main() -> Result<()> {
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
            .comment(filename)
        );
    }
    for (lsn, kind) in lines {
        let lsn_start = *lsn_map.get(&lsn).unwrap();
        let lsn_end = lsn_start;
        let stretch = 2.0;
        let lsn_diff = 0.3;
        let lsn_offset = -lsn_diff / 2.0;
        let ymargin = 0.05;
        println!(
            "{}",
            rectangle(
                0.0f32 + stretch * xmargin,
                stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
                (key_map.len() + 10) as f32,
                stretch * (lsn_diff - 2.0 * ymargin)
            )
            .fill(kind)
        );
    }
    println!("{}", EndSvg);
    eprintln!("num_images: {}", num_images);
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,8 +2,8 @@ use std::collections::HashMap;
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
    let summary_blk = block_reader.read_blk(0, ctx).await?;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
    let summary_blk = block_reader.read_blk(0, ctx).await?;
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -219,6 +219,7 @@ fn handle_metadata(
    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
    println!("Current metadata:\n{meta:?}");
    let mut update_meta = false;
    // TODO: simplify this part
    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
        meta = TimelineMetadata::new(
            *disk_consistent_lsn,
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -0,0 +1,98 @@
 use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
 use pageserver_api::shard::TenantShardId;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 use std::collections::HashMap;
 use std::sync::Arc;
 /// Ingest aux files into the pageserver.
 // NOTE: field notes below use `//` rather than `///` on purpose: with the clap
 // derive, `///` comments on fields become part of the generated help text.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    // Base URL handed to the management API client.
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    // NOTE(review): not read by the code in this file as far as visible here.
    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
    page_service_connstring: String,
    // Optional JWT passed to the management API client.
    #[clap(long)]
    pageserver_jwt: Option<String>,
    // Optional explicit targets; when given, exactly one is accepted.
    targets: Option<Vec<TenantTimelineId>>,
 }
 /// Synchronous entry point: builds a multi-threaded tokio runtime, runs
 /// `main_impl` on it as a spawned task and returns that task's result.
 ///
 /// Panics if the runtime cannot be built or if the spawned task fails to join.
 pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    let mut builder = tokio::runtime::Builder::new_multi_thread();
    builder.enable_all();
    let runtime = builder.build().unwrap();

    let handle = runtime.spawn(main_impl(args));
    let joined = runtime.block_on(handle);
    joined.unwrap()
 }
 /// Ingest aux files into a single timeline through the management API.
 ///
 /// Steps:
 /// 1. Resolve the target timeline (the explicit target if one was given,
 ///    otherwise the first timeline returned by target discovery).
 /// 2. Set the tenant's `switch_aux_file_policy` to `AuxFilePolicy::V2`.
 /// 3. Ingest 100 batches of 100 files named
 ///    `pg_logical/mappings/<batch>.<id>`.
 /// 4. List the aux files back and print how many were found.
 ///
 /// # Errors
 /// Returns an error if more or fewer than one explicit target is given, if
 /// no timeline can be resolved, or if any management API call fails.
 async fn main_impl(args: Args) -> anyhow::Result<()> {
    // Leak the arguments to obtain a `'static` reference for the rest of the run.
    let args: &'static Args = Box::leak(Box::new(args));
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        args.mgmt_api_endpoint.clone(),
        args.pageserver_jwt.as_deref(),
    ));
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
        crate::util::cli::targets::Spec {
            limit_to_first_n_targets: None,
            targets: {
                if let Some(targets) = &args.targets {
                    if targets.len() != 1 {
                        anyhow::bail!("must specify exactly one target");
                    }
                    Some(targets.clone())
                } else {
                    None
                }
            },
        },
    )
    .await?;
    // Discovery may yield nothing; report that as an error instead of
    // panicking on an out-of-bounds index.
    let timeline = match timelines.first() {
        Some(timeline) => *timeline,
        None => anyhow::bail!("no timelines found to operate on"),
    };
    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
    let timeline_id = timeline.timeline_id;
    println!("operating on timeline {}", timeline);
    mgmt_api_client
        .tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
            config: TenantConfig {
                switch_aux_file_policy: Some(AuxFilePolicy::V2),
                ..Default::default()
            },
        })
        .await?;
    for batch in 0..100 {
        // One batch: 100 (path, content) pairs keyed by path.
        let items = (0..100)
            .map(|id| {
                (
                    format!("pg_logical/mappings/{:03}.{:03}", batch, id),
                    format!("{:08}", id),
                )
            })
            .collect::<HashMap<_, _>>();
        let file_cnt = items.len();
        mgmt_api_client
            .ingest_aux_files(tenant_shard_id, timeline_id, items)
            .await?;
        println!("ingested {file_cnt} files");
    }
    // List at the LSN just below `Lsn::MAX`.
    let files = mgmt_api_client
        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
        .await?;
    println!("{} files found", files.len());
    anyhow::Ok(())
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -312,8 +312,12 @@ async fn main_impl(
                    let (rel_tag, block_no) =
                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
-                        latest: rng.gen_bool(args.req_latest_probability),
+                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                        lsn: r.timeline_lsn,
+                            Lsn::MAX
                        } else {
                            r.timeline_lsn
                        },
                        not_modified_since: r.timeline_lsn,
                        rel: rel_tag,
                        blkno: block_no,
                    }
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};
 use std::{f64, sync::Arc};
 use tokio::{
    sync::{mpsc, OwnedSemaphorePermit},
    task::JoinSet,
@@ -12,10 +14,7 @@ use tokio::{
 use std::{
    num::NonZeroUsize,
-    sync::{
+    sync::atomic::{AtomicU64, Ordering},
        atomic::{AtomicU64, Ordering},
        Arc,
    },
    time::{Duration, Instant},
 };
@@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    Ok(())
 }
 /// Final summary of a run; serialized to pretty-printed JSON and written to
 /// stdout once all tasks have finished. Field names are the JSON keys.
 #[derive(serde::Serialize)]
 struct Output {
    /// Loaded from the total download counter.
    downloads_count: u64,
    /// Loaded from the total downloaded-bytes counter.
    downloads_bytes: u64,
    /// Loaded from the total eviction counter.
    evictions_count: u64,
    /// Loaded from the total timeline-restart counter.
    timeline_restarts: u64,
    /// Wall-clock time between the start and end instants of the run,
    /// serialized through `humantime_serde`.
    #[serde(with = "humantime_serde")]
    runtime: Duration,
 }
 /// Event counters shared between tasks; every field is an `AtomicU64` and the
 /// code visible here accesses them with `Ordering::Relaxed`.
 #[derive(Debug, Default)]
 struct LiveStats {
-    evictions: AtomicU64,
+    evictions_count: AtomicU64,
-    downloads: AtomicU64,
+    downloads_count: AtomicU64,
    downloads_bytes: AtomicU64,
    timeline_restarts: AtomicU64,
 }
 impl LiveStats {
    fn eviction_done(&self) {
-        self.evictions.fetch_add(1, Ordering::Relaxed);
+        self.evictions_count.fetch_add(1, Ordering::Relaxed);
    }
-    fn download_done(&self) {
+    fn download_done(&self, size: u64) {
-        self.downloads.fetch_add(1, Ordering::Relaxed);
+        self.downloads_count.fetch_add(1, Ordering::Relaxed);
        self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
    }
    fn timeline_restart_done(&self) {
        self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    )
    .await?;
    let token = CancellationToken::new();
    let mut tasks = JoinSet::new();
-    let live_stats = Arc::new(LiveStats::default());
+    let periodic_stats = Arc::new(LiveStats::default());
    let total_stats = Arc::new(LiveStats::default());
    let start = Instant::now();
    tasks.spawn({
-        let live_stats = Arc::clone(&live_stats);
+        let periodic_stats = Arc::clone(&periodic_stats);
        let total_stats = Arc::clone(&total_stats);
        let cloned_token = token.clone();
        async move {
            let mut last_at = Instant::now();
            loop {
                if cloned_token.is_cancelled() {
                    return;
                }
                tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
                let now = Instant::now();
                let delta: Duration = now - last_at;
                last_at = now;
                let LiveStats {
-                    evictions,
+                    evictions_count,
-                    downloads,
+                    downloads_count,
                    downloads_bytes,
                    timeline_restarts,
-                } = &*live_stats;
+                } = &*periodic_stats;
-                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
-                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
                let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
                let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
-                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
+
                total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
                total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
                total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
                total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
                let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
                let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
                let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
                info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
            }
        }
    });
@@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                args,
                Arc::clone(&mgmt_api_client),
                tl,
-                Arc::clone(&live_stats),
+                Arc::clone(&periodic_stats),
                token.clone(),
            ));
        }
    }
    if let Some(runtime) = args.runtime {
        tokio::spawn(async move {
            tokio::time::sleep(runtime.into()).await;
            token.cancel();
        });
    }
    while let Some(res) = tasks.join_next().await {
        res.unwrap();
    }
    let end = Instant::now();
    let duration: Duration = end - start;
    let output = {
        let LiveStats {
            evictions_count,
            downloads_count,
            downloads_bytes,
            timeline_restarts,
        } = &*total_stats;
        Output {
            downloads_count: downloads_count.load(Ordering::Relaxed),
            downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
            evictions_count: evictions_count.load(Ordering::Relaxed),
            timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
            runtime: duration,
        }
    };
    let output = serde_json::to_string_pretty(&output).unwrap();
    println!("{output}");
    Ok(())
 }
@@ -140,6 +200,7 @@ async fn timeline_actor(
    mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
    timeline: TenantTimelineId,
    live_stats: Arc<LiveStats>,
    token: CancellationToken,
 ) {
    // TODO: support sharding
    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -149,7 +210,7 @@ async fn timeline_actor(
        layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
        concurrency: Arc<tokio::sync::Semaphore>,
    }
-    loop {
+    while !token.is_cancelled() {
        debug!("restarting timeline");
        let layer_map_info = mgmt_api_client
            .layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -185,7 +246,7 @@ async fn timeline_actor(
        live_stats.timeline_restart_done();
-        loop {
+        while !token.is_cancelled() {
            assert!(!timeline.joinset.is_empty());
            if let Some(res) = timeline.joinset.try_join_next() {
                debug!(?res, "a layer actor exited, should not happen");
@@ -255,7 +316,7 @@ async fn layer_actor(
                    .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
                    .await
                    .unwrap();
-                live_stats.download_done();
+                live_stats.download_done(layer.layer_file_size());
                did_it
            }
        };
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -14,6 +14,7 @@ mod util {
 /// The pagebench CLI sub-commands, dispatched in [`main`] below.
 mod cmd {
    pub(super) mod aux_files;
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod ondemand_download_churn;
@@ -27,6 +28,7 @@ enum Args {
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
    AuxFiles(cmd::aux_files::Args),
 }
 fn main() {
@@ -46,6 +48,7 @@ fn main() {
            cmd::trigger_initial_size_calculation::main(args)
        }
        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
        Args::AuxFiles(args) => cmd::aux_files::main(args),
    }
    .unwrap()
 }
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -0,0 +1,285 @@
 use std::sync::Arc;
 use ::metrics::IntGauge;
 use bytes::{Buf, BufMut, Bytes};
 use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
 use tracing::warn;
 // BEGIN Copyright (c) 2017 Servo Contributors
 /// FNV-1a style hash over `bytes` with a 128-bit state, usable in `const` contexts.
 ///
 /// Starting from the offset basis, every input byte is XORed into the state, which is
 /// then multiplied by the FNV prime using wrapping arithmetic. An empty input returns
 /// the offset basis unchanged.
 #[inline]
 #[must_use]
 pub const fn fnv_hash(bytes: &[u8]) -> u128 {
    const OFFSET_BASIS: u128 = 0x6c62272e07bb014262b821756295c58d;
    const FNV_PRIME: u128 = 0x0000000001000000000000000000013B;
    let mut state = OFFSET_BASIS;
    let mut rest = bytes;
    // Iterators are not available in `const fn`, so peel off one byte per round.
    while let [byte, tail @ ..] = rest {
        state = (state ^ *byte as u128).wrapping_mul(FNV_PRIME);
        rest = tail;
    }
    state
 }
 // END Copyright (c) 2017 Servo Contributors
 /// Build the metadata key for an aux file, laid out as
 /// [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
 ///
 /// `dir_level1` and `dir_level2` become bytes 1 and 2 of the key. `data` is hashed with
 /// [`fnv_hash`] and the last 13 bytes of the big-endian hash fill the remaining bytes.
 fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
    let digest = fnv_hash(data).to_be_bytes();
    let mut raw: [u8; 16] = [0; METADATA_KEY_SIZE];
    raw[..3].copy_from_slice(&[AUX_KEY_PREFIX, dir_level1, dir_level2]);
    // The digest is big-endian, so its tail holds the least significant bytes.
    raw[3..].copy_from_slice(&digest[3..]);
    Key::from_metadata_key_fixed_size(&raw)
 }
 // First-level directory bytes (byte 1 of the metadata key) used by `encode_aux_file_key`.
 /// Paths under `pg_logical/`.
 const AUX_DIR_PG_LOGICAL: u8 = 0x01;
 /// Paths under `pg_replslot/`.
 const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
 /// Any path that matches none of the known directories.
 const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
 /// Map an aux file path to its fixed-size metadata key.
 ///
 /// Byte 0 of the key is the AUX key prefix and bytes 1-2 identify the directory / aux file
 /// type. Each supported aux file type has a one-to-one mapping to a 2-byte prefix, chosen
 /// roughly from the first two path components (one number per component). The rest of the
 /// path (usually a single file name, or several components) is hashed into the remaining
 /// 13 bytes.
 ///
 /// * pg_logical/mappings -> 0x0101
 /// * pg_logical/snapshots -> 0x0102
 /// * pg_logical/replorigin_checkpoint -> 0x0103
 /// * pg_logical/others -> 0x01FF
 /// * pg_replslot/ -> 0x0201
 /// * others -> 0xFFFF
 ///
 /// When adding a new AUX file type here, also add a test case to `test_encoding_portable`.
 /// The new type must never have been written to storage before: otherwise existing data
 /// may still live under the `others` prefix while new lookups use the new prefix, which
 /// would corrupt data.
 pub fn encode_aux_file_key(path: &str) -> Key {
    // First decide the two directory bytes and which bytes get hashed, then build the key once.
    let (dir_level1, dir_level2, hashed): (u8, u8, &[u8]) =
        if let Some(rest) = path.strip_prefix("pg_logical/mappings/") {
            (AUX_DIR_PG_LOGICAL, 0x01, rest.as_bytes())
        } else if let Some(rest) = path.strip_prefix("pg_logical/snapshots/") {
            (AUX_DIR_PG_LOGICAL, 0x02, rest.as_bytes())
        } else if path == "pg_logical/replorigin_checkpoint" {
            // A single well-known file: nothing is left to hash.
            (AUX_DIR_PG_LOGICAL, 0x03, b"".as_slice())
        } else if let Some(rest) = path.strip_prefix("pg_logical/") {
            if cfg!(debug_assertions) {
                warn!(
                    "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
                    path
                );
            }
            (AUX_DIR_PG_LOGICAL, 0xFF, rest.as_bytes())
        } else if let Some(rest) = path.strip_prefix("pg_replslot/") {
            (AUX_DIR_PG_REPLSLOT, 0x01, rest.as_bytes())
        } else {
            if cfg!(debug_assertions) {
                warn!(
                    "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
                    path
                );
            }
            // Unknown directory: the whole path is hashed.
            (AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
        };
    aux_hash_to_metadata_key(dir_level1, dir_level2, hashed)
 }
 /// Version tag written as the first byte of every non-empty encoded aux file value
 /// and checked by the decoders before reading any entries.
 const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
 pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
    let mut ptr = val;
    if ptr.is_empty() {
        // empty value = no files
        return Ok(Vec::new());
    }
    assert_eq!(
        ptr.get_u8(),
        AUX_FILE_ENCODING_VERSION,
        "unsupported aux file value"
    );
    let mut files = vec![];
    while ptr.has_remaining() {
        let key_len = ptr.get_u32() as usize;
        let key = &ptr[..key_len];
        ptr.advance(key_len);
        let val_len = ptr.get_u32() as usize;
        let content = &ptr[..val_len];
        ptr.advance(val_len);
        let path = std::str::from_utf8(key)?;
        files.push((path, content));
    }
    Ok(files)
 }
 /// Decode an encoded aux file value into a list of `(path, content)` pairs.
 ///
 /// Each returned `Bytes` is a slice of the original value and keeps a reference to it.
 /// Be cautious about memory consumption. An empty `val` decodes to an empty list.
 ///
 /// # Errors
 ///
 /// Returns an error (rather than panicking) when the version byte is not
 /// `AUX_FILE_ENCODING_VERSION`, when the buffer ends before a length field or its payload
 /// is complete, or when a path is not valid UTF-8.
 pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
    let mut ptr = val.clone();
    if ptr.is_empty() {
        // empty value = no files
        return Ok(Vec::new());
    }
    let version = ptr.get_u8();
    anyhow::ensure!(
        version == AUX_FILE_ENCODING_VERSION,
        "unsupported aux file value: version {:#04x}",
        version
    );
    let mut files = vec![];
    while ptr.has_remaining() {
        // `get_u32` and `Bytes::slice` panic on short input, so check lengths up front.
        anyhow::ensure!(
            ptr.remaining() >= 4,
            "truncated aux file value: incomplete path length"
        );
        let key_len = ptr.get_u32() as usize;
        anyhow::ensure!(
            ptr.remaining() >= key_len,
            "truncated aux file value: path needs {} bytes but only {} remain",
            key_len,
            ptr.remaining()
        );
        let key = ptr.slice(..key_len);
        ptr.advance(key_len);
        anyhow::ensure!(
            ptr.remaining() >= 4,
            "truncated aux file value: incomplete content length"
        );
        let val_len = ptr.get_u32() as usize;
        anyhow::ensure!(
            ptr.remaining() >= val_len,
            "truncated aux file value: content needs {} bytes but only {} remain",
            val_len,
            ptr.remaining()
        );
        let content = ptr.slice(..val_len);
        ptr.advance(val_len);
        let path = std::str::from_utf8(&key)?.to_string();
        files.push((path, content));
    }
    Ok(files)
 }
 pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
    if files.is_empty() {
        // no files = empty value
        return Ok(Vec::new());
    }
    let mut encoded = vec![];
    encoded.put_u8(AUX_FILE_ENCODING_VERSION);
    for (path, content) in files {
        if path.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds path size limit", path);
        }
        encoded.put_u32(path.len() as u32);
        encoded.put_slice(path.as_bytes());
        if content.len() > u32::MAX as usize {
            anyhow::bail!("{} exceeds content size limit", path);
        }
        encoded.put_u32(content.len() as u32);
        encoded.put_slice(content);
    }
    Ok(encoded)
 }
 /// An estimation of the size of aux files.
 ///
 /// The estimate starts out unknown and is established by `on_base_backup`; until then
 /// `on_add`, `on_remove` and `on_update` do nothing. Every change to a known estimate is
 /// published to the gauge.
 pub struct AuxFileSizeEstimator {
    /// Gauge that receives the current estimate via `report`.
    aux_file_size_gauge: IntGauge,
    /// Current estimate; `None` until `on_base_backup` has supplied a baseline.
    size: Arc<std::sync::Mutex<Option<isize>>>,
 }
 impl AuxFileSizeEstimator {
    /// Create an estimator with no known size; it reports to `aux_file_size_gauge`.
    pub fn new(aux_file_size_gauge: IntGauge) -> Self {
        let size = Arc::new(std::sync::Mutex::new(None));
        Self {
            aux_file_size_gauge,
            size,
        }
    }
    /// Set the estimate to `new_size` and report it, replacing any previous value.
    pub fn on_base_backup(&self, new_size: usize) {
        let mut slot = self.size.lock().unwrap();
        let total = new_size as isize;
        *slot = Some(total);
        self.report(total);
    }
    /// Account for a newly added file of `file_size` bytes. No-op while the size is unknown.
    pub fn on_add(&self, file_size: usize) {
        self.adjust_known_size(|total| *total += file_size as isize);
    }
    /// Account for a removed file of `file_size` bytes. No-op while the size is unknown.
    pub fn on_remove(&self, file_size: usize) {
        self.adjust_known_size(|total| *total -= file_size as isize);
    }
    /// Account for a file whose size changed from `old_size` to `new_size`.
    /// No-op while the size is unknown.
    pub fn on_update(&self, old_size: usize, new_size: usize) {
        self.adjust_known_size(|total| *total += new_size as isize - old_size as isize);
    }
    /// Publish `size` to the gauge.
    pub fn report(&self, size: isize) {
        self.aux_file_size_gauge.set(size as i64);
    }
    /// Apply `adjust` to the estimate and report the result, but only if a baseline exists.
    /// The lock is held for both the adjustment and the report.
    fn adjust_known_size(&self, adjust: impl FnOnce(&mut isize)) {
        let mut slot = self.size.lock().unwrap();
        if let Some(total) = slot.as_mut() {
            adjust(total);
            self.report(*total);
        }
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn test_hash_portable() {
        // The AUX file encoding relies on the hash being identical on every platform, so
        // pin the output of the algorithm for a few fixed inputs.
        let cases: [(&str, u128); 3] = [
            ("test1", 265160408618497461376862998434862070044),
            ("test/test2", 295486155126299629456360817749600553988),
            ("", 144066263297769815596495629667062367629),
        ];
        for &(input, expected) in &cases {
            assert_eq!(expected, super::fnv_hash(input.as_bytes()));
        }
    }
    #[test]
    fn test_encoding_portable() {
        // To correctly retrieve AUX files, the key generated for a given file must be the
        // same in all versions of the page server.
        let cases: [(&str, &str); 6] = [
            (
                "62000001017F8B83D94F7081693471ABF91C",
                "pg_logical/mappings/test1",
            ),
            (
                "62000001027F8E83D94F7081693471ABFCCD",
                "pg_logical/snapshots/test2",
            ),
            (
                "62000001032E07BB014262B821756295C58D",
                "pg_logical/replorigin_checkpoint",
            ),
            (
                "62000001FF4F38E1C74754E7D03C1A660178",
                "pg_logical/unsupported",
            ),
            ("62000002017F8D83D94F7081693471ABFB92", "pg_replslot/test3"),
            (
                "620000FFFF2B6ECC8AEF93F643DC44F15E03",
                "other_file_not_supported",
            ),
        ];
        for &(expected, path) in &cases {
            assert_eq!(expected, encode_aux_file_key(path).to_string());
        }
    }
    #[test]
    fn test_value_encoding() {
        // Encoding followed by decoding must give back the original file list.
        fn assert_roundtrip(files: Vec<(&str, &[u8])>) {
            let encoded = encode_file_value(&files).unwrap();
            assert_eq!(files, decode_file_value(&encoded).unwrap());
        }
        assert_roundtrip(vec![
            ("pg_logical/1.file", "1111".as_bytes()),
            ("pg_logical/2.file", "2222".as_bytes()),
        ]);
        // An empty list is a valid value as well.
        assert_roundtrip(vec![]);
    }
 }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,7 +10,7 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, bail, ensure, Context};
+use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
 use pageserver_api::key::{key_to_slru_block, Key};
@@ -38,6 +38,14 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;
 /// Error returned while producing a basebackup tarball, split by which side failed.
 #[derive(Debug, thiserror::Error)]
 pub enum BasebackupError {
    /// Failure on the pageserver side, e.g. reading timeline data or a failed sanity check.
    /// Any `anyhow::Error` converts into this variant via `?`.
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
    /// I/O failure while appending to or finishing the tar archive being written out.
    #[error("basebackup client error {0:#}")]
    Client(#[source] io::Error),
 }
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -53,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    ctx: &'a RequestContext,
-) -> anyhow::Result<()>
+) -> Result<(), BasebackupError>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
@@ -92,8 +100,10 @@ where
    // Consolidate the derived and the provided prev_lsn values
    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-        if backup_prev != Lsn(0) {
+        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
-            ensure!(backup_prev == provided_prev_lsn);
+            return Err(BasebackupError::Server(anyhow!(
                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
            )));
        }
        provided_prev_lsn
    } else {
@@ -159,15 +169,26 @@ where
        }
    }
-    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
        let (kind, segno, _) = key_to_slru_block(*key)?;
        match kind {
            SlruKind::Clog => {
-                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
                    return Err(BasebackupError::Server(anyhow!(
                        "invalid SlruKind::Clog record: block.len()={}",
                        block.len()
                    )));
                }
            }
            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                ensure!(block.len() == BLCKSZ as usize);
+                if block.len() != BLCKSZ as usize {
                    return Err(BasebackupError::Server(anyhow!(
                        "invalid {:?} record: block.len()={}",
                        kind,
                        block.len()
                    )));
                }
            }
        }
@@ -194,12 +215,15 @@ where
        Ok(())
    }
-    async fn flush(&mut self) -> anyhow::Result<()> {
+    async fn flush(&mut self) -> Result<(), BasebackupError> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar.append(&header, self.buf.as_slice()).await?;
+        self.ar
            .append(&header, self.buf.as_slice())
            .await
            .map_err(BasebackupError::Client)?;
        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -209,7 +233,7 @@ where
        Ok(())
    }
-    async fn finish(mut self) -> anyhow::Result<()> {
+    async fn finish(mut self) -> Result<(), BasebackupError> {
        let res = if self.current_segment.is_none() || self.buf.is_empty() {
            Ok(())
        } else {
@@ -226,7 +250,7 @@ impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
-    async fn send_tarball(mut self) -> anyhow::Result<()> {
+    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
        // TODO include checksum
        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -262,16 +286,25 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await?
+                .await
-                .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
+                .map_err(|e| BasebackupError::Server(e.into()))?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
                );
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
            for part in slru_partitions.parts {
-                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
+                let blocks = self
                    .timeline
                    .get_vectored(part, self.lsn, self.ctx)
                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?;
                for (key, block) in blocks {
-                    slru_builder.add_block(&key, block?).await?;
+                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
                    slru_builder.add_block(&key, block).await?;
                }
            }
            slru_builder.finish().await?;
@@ -279,8 +312,11 @@ where
        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in
+        for ((spcnode, dbnode), has_relmap_file) in self
-            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
+            .timeline
            .list_dbdirs(self.lsn, self.ctx)
            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -289,7 +325,8 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await?;
+                .await
                .map_err(|e| BasebackupError::Server(e.into()))?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -312,7 +349,12 @@ where
                }
            }
-            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
+            for (path, content) in self
                .timeline
                .list_aux_files(self.lsn, self.ctx)
                .await
                .map_err(|e| BasebackupError::Server(e.into()))?
            {
                if path.starts_with("pg_replslot") {
                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                    let restart_lsn = Lsn(u64::from_le_bytes(
@@ -343,34 +385,41 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await?
+            .await
            .map_err(|e| BasebackupError::Server(e.into()))?
        {
            self.add_twophase_file(xid).await?;
        }
        fail_point!("basebackup-before-control-file", |_| {
-            bail!("failpoint basebackup-before-control-file")
+            Err(BasebackupError::Server(anyhow!(
                "failpoint basebackup-before-control-file"
            )))
        });
        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file().await?;
-        self.ar.finish().await?;
+        self.ar.finish().await.map_err(BasebackupError::Client)?;
        debug!("all tarred up!");
        Ok(())
    }
    /// Add contents of relfilenode `src`, naming it as `dst`.
-    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
+    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await?;
+            .await
            .map_err(|e| BasebackupError::Server(e.into()))?;
        // If the relation is empty, create an empty file
        if nblocks == 0 {
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
                .append(&header, &mut io::empty())
                .await
                .map_err(BasebackupError::Client)?;
            return Ok(());
        }
@@ -384,14 +433,18 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
-                    .await?;
+                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?;
                segment_data.extend_from_slice(&img[..]);
            }
            let file_name = dst.to_segfile_name(seg as u32);
            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
-            self.ar.append(&header, segment_data.as_slice()).await?;
+            self.ar
                .append(&header, segment_data.as_slice())
                .await
                .map_err(BasebackupError::Client)?;
            seg += 1;
            startblk = endblk;
@@ -411,20 +464,22 @@ where
        spcnode: u32,
        dbnode: u32,
        has_relmap_file: bool,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), BasebackupError> {
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await?;
+                .await
                .map_err(|e| BasebackupError::Server(e.into()))?;
-            ensure!(
+            if img.len()
-                img.len()
+                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
-                    == dispatch_pgversion!(
+            {
-                        self.timeline.pg_version,
+                return Err(BasebackupError::Server(anyhow!(
-                        pgv::bindings::SIZEOF_RELMAPFILE
+                    "img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
-                    )
+                    img.len(),
-            );
+                )));
            }
            Some(img)
        } else {
@@ -437,14 +492,20 @@ where
                ver => format!("{ver}\x0A"),
            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar.append(&header, pg_version_str.as_bytes()).await?;
+            self.ar
                .append(&header, pg_version_str.as_bytes())
                .await
                .map_err(BasebackupError::Client)?;
            info!("timeline.pg_version {}", self.timeline.pg_version);
            if let Some(img) = relmap_img {
                // filenode map for global tablespace
                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar.append(&header, &img[..]).await?;
+                self.ar
                    .append(&header, &img[..])
                    .await
                    .map_err(BasebackupError::Client)?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -463,18 +524,26 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await?
+                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?
                    .is_empty()
            {
                return Ok(());
            }
            // User defined tablespaces are not supported
-            ensure!(spcnode == DEFAULTTABLESPACE_OID);
+            if spcnode != DEFAULTTABLESPACE_OID {
                return Err(BasebackupError::Server(anyhow!(
                    "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
                )));
            }
            // Append dir path for each database
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
-            self.ar.append(&header, &mut io::empty()).await?;
+            self.ar
                .append(&header, &mut io::empty())
                .await
                .map_err(BasebackupError::Client)?;
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -484,11 +553,17 @@ where
                    ver => format!("{ver}\x0A"),
                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar.append(&header, pg_version_str.as_bytes()).await?;
+                self.ar
                    .append(&header, pg_version_str.as_bytes())
                    .await
                    .map_err(BasebackupError::Client)?;
                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar.append(&header, &img[..]).await?;
+                self.ar
                    .append(&header, &img[..])
                    .await
                    .map_err(BasebackupError::Client)?;
            }
        };
        Ok(())
@@ -497,11 +572,12 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await?;
+            .await
            .map_err(|e| BasebackupError::Server(e.into()))?;
        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -509,7 +585,10 @@ where
        buf.put_u32_le(crc);
        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar.append(&header, &buf[..]).await?;
+        self.ar
            .append(&header, &buf[..])
            .await
            .map_err(BasebackupError::Client)?;
        Ok(())
    }
@@ -518,24 +597,28 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
-    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
-            if self.lsn == self.timeline.get_ancestor_lsn() {
+            if self.timeline.is_ancestor_lsn(self.lsn) {
-                write!(zenith_signal, "PREV LSN: none")?;
+                write!(zenith_signal, "PREV LSN: none")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            } else {
-                write!(zenith_signal, "PREV LSN: invalid")?;
+                write!(zenith_signal, "PREV LSN: invalid")
                    .map_err(|e| BasebackupError::Server(e.into()))?;
            }
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
+            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
                .map_err(|e| BasebackupError::Server(e.into()))?;
        }
        self.ar
            .append(
                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
                zenith_signal.as_bytes(),
            )
-            .await?;
+            .await
            .map_err(BasebackupError::Client)?;
        let checkpoint_bytes = self
            .timeline
@@ -557,7 +640,10 @@ where
        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar.append(&header, &pg_control_bytes[..]).await?;
+        self.ar
            .append(&header, &pg_control_bytes[..])
            .await
            .map_err(BasebackupError::Client)?;
        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -572,8 +658,16 @@ where
            self.lsn,
        )
        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
-        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
+        if wal_seg.len() != WAL_SEGMENT_SIZE {
-        self.ar.append(&header, &wal_seg[..]).await?;
+            return Err(BasebackupError::Server(anyhow!(
                "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
                wal_seg.len()
            )));
        }
        self.ar
            .append(&header, &wal_seg[..])
            .await
            .map_err(BasebackupError::Client)?;
        Ok(())
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -3,6 +3,7 @@
 //! Main entry point for the Page Server executable.
 use std::env::{var, VarError};
 use std::io::Read;
 use std::sync::Arc;
 use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};
@@ -121,8 +122,10 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );
-    // after setting up logging, log the effective IO engine choice
+    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -149,37 +152,34 @@ fn initialize_config(
    workdir: &Utf8Path,
 ) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
    let init = arg_matches.get_flag("init");
    let update_config = init || arg_matches.get_flag("update-config");
-    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
+    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
-        if init {
+        Ok(mut f) => {
-            anyhow::bail!(
+            if init {
-                "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
+                anyhow::bail!("config file already exists: {cfg_file_path}");
-            );
+            }
            let md = f.metadata().context("stat config file")?;
            if md.is_file() {
                let mut s = String::new();
                f.read_to_string(&mut s).context("read config file")?;
                Some(s.parse().context("parse config file toml")?)
            } else {
                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
            }
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
        Err(e) => {
            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
        }
        // Supplement the CLI arguments with the config file
        let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
            .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
        (
            cfg_file_contents
                .parse::<toml_edit::Document>()
                .with_context(|| {
                    format!("Failed to parse '{cfg_file_path}' as pageserver config")
                })?,
            true,
        )
    } else if cfg_file_path.exists() {
        anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
    } else {
        // We're initializing the tenant, so there's no config file yet
        (
            DEFAULT_CONFIG_FILE
                .parse::<toml_edit::Document>()
                .context("could not parse built-in config file")?,
            false,
        )
    };
    let mut effective_config = file_contents.unwrap_or_else(|| {
        DEFAULT_CONFIG_FILE
            .parse()
            .expect("unit tests ensure this works")
    });
    // Patch with overrides from the command line
    if let Some(values) = arg_matches.get_many::<String>("config-override") {
        for option_line in values {
            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
@@ -187,22 +187,21 @@ fn initialize_config(
            })?;
            for (key, item) in doc.iter() {
-                if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
+                effective_config.insert(key, item.clone());
                    anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
                }
                toml.insert(key, item.clone());
            }
        }
    }
-    debug!("Resulting toml: {toml}");
+    debug!("Resulting toml: {effective_config}");
-    let conf = PageServerConf::parse_and_validate(&toml, workdir)
+
    // Construct the runtime representation
    let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
        .context("Failed to parse pageserver configuration")?;
-    if update_config {
+    if init {
        info!("Writing pageserver config to '{cfg_file_path}'");
-        std::fs::write(cfg_file_path, toml.to_string())
+        std::fs::write(cfg_file_path, effective_config.to_string())
            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
        info!("Config successfully written to '{cfg_file_path}'")
    }
@@ -285,7 +284,6 @@ fn start_pageserver(
    ))
    .unwrap();
    pageserver::preinitialize_metrics();
    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
@@ -517,16 +515,12 @@ fn start_pageserver(
        }
    });
-    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+    let secondary_controller = secondary::spawn_tasks(
-        secondary::spawn_tasks(
+        tenant_manager.clone(),
-            tenant_manager.clone(),
+        remote_storage.clone(),
-            remote_storage.clone(),
+        background_jobs_barrier.clone(),
-            background_jobs_barrier.clone(),
+        shutdown_pageserver.clone(),
-            shutdown_pageserver.clone(),
+    );
        )
    } else {
        secondary::null_controller()
    };
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -534,15 +528,13 @@ fn start_pageserver(
    // been configured.
    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
-    if let Some(remote_storage) = &remote_storage {
+    launch_disk_usage_global_eviction_task(
-        launch_disk_usage_global_eviction_task(
+        conf,
-            conf,
+        remote_storage.clone(),
-            remote_storage.clone(),
+        disk_usage_eviction_state.clone(),
-            disk_usage_eviction_state.clone(),
+        tenant_manager.clone(),
-            tenant_manager.clone(),
+        background_jobs_barrier.clone(),
-            background_jobs_barrier.clone(),
+    )?;
        )?;
    }
    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
@@ -655,17 +647,20 @@ fn start_pageserver(
            None,
            "libpq endpoint listener",
            true,
-            async move {
+            {
-                page_service::libpq_listener_main(
+                let tenant_manager = tenant_manager.clone();
-                    conf,
+                async move {
-                    broker_client,
+                    page_service::libpq_listener_main(
-                    pg_auth,
+                        tenant_manager,
-                    pageserver_listener,
+                        broker_client,
-                    conf.pg_auth_type,
+                        pg_auth,
-                    libpq_ctx,
+                        pageserver_listener,
-                    task_mgr::shutdown_token(),
+                        conf.pg_auth_type,
-                )
+                        libpq_ctx,
-                .await
+                        task_mgr::shutdown_token(),
                    )
                    .await
                }
            },
        );
    }
@@ -694,14 +689,7 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
+            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
            let bg_deletion_queue = deletion_queue.clone();
            pageserver::shutdown_pageserver(
                &tenant_manager,
                bg_remote_storage.map(|_| bg_deletion_queue),
                0,
            )
            .await;
            unreachable!()
        })
    }
@@ -709,12 +697,11 @@ fn start_pageserver(
 fn create_remote_storage_client(
    conf: &'static PageServerConf,
-) -> anyhow::Result<Option<GenericRemoteStorage>> {
+) -> anyhow::Result<GenericRemoteStorage> {
    let config = if let Some(config) = &conf.remote_storage_config {
        config
    } else {
-        tracing::warn!("no remote storage configured, this is a deprecated configuration");
+        anyhow::bail!("no remote storage configured, this is a deprecated configuration");
        return Ok(None);
    };
    // Create the client
@@ -734,7 +721,7 @@ fn create_remote_storage_client(
            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
    }
-    Ok(Some(remote_storage))
+    Ok(remote_storage)
 }
 fn cli() -> Command {
@@ -756,18 +743,13 @@ fn cli() -> Command {
        // See `settings.md` for more details on the extra configuration parameters pageserver can process
        .arg(
            Arg::new("config-override")
                .long("config-override")
                .short('c')
                .num_args(1)
                .action(ArgAction::Append)
                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
        .arg(
            Arg::new("update-config")
                .long("update-config")
                .action(ArgAction::SetTrue)
                .help("Update the config file when started"),
        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
-use std::{collections::HashMap, env};
+use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
@@ -30,9 +30,9 @@ use utils::{
    logging::LogFormat,
 };
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -51,7 +51,7 @@ pub mod defaults {
    use crate::tenant::config::defaults::*;
    use const_format::formatcp;
-    pub use pageserver_api::{
+    pub use pageserver_api::config::{
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
@@ -91,13 +91,15 @@ pub mod defaults {
    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
    pub const DEFAULT_GET_IMPL: &str = "legacy";
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
    ///
    /// Default built-in configuration file.
@@ -138,6 +140,8 @@ pub mod defaults {
 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
 #get_impl = '{DEFAULT_GET_IMPL}'
 #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
@@ -284,6 +288,8 @@ pub struct PageServerConf {
    pub get_vectored_impl: GetVectoredImpl,
    pub get_impl: GetImpl,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
    pub validate_vectored_get: bool,
@@ -329,26 +335,6 @@ impl<T: Clone> BuilderValue<T> {
    }
 }
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not needed by the pageserver
 // itself, it is only used for registering the pageserver with the control
 // plane and/or storage controller.
 //
 #[derive(serde::Deserialize)]
 pub(crate) struct NodeMetadata {
    #[serde(rename = "host")]
    pub(crate) postgres_host: String,
    #[serde(rename = "port")]
    pub(crate) postgres_port: u16,
    pub(crate) http_host: String,
    pub(crate) http_port: u16,
    // Deployment tools may write fields to the metadata file beyond what we
    // use in this type: this type intentionally only names the fields that we require.
    #[serde(flatten)]
    pub(crate) other: HashMap<String, serde_json::Value>,
 }
 // needed to simplify config construction
 #[derive(Default)]
 struct PageServerConfigBuilder {
@@ -414,6 +400,8 @@ struct PageServerConfigBuilder {
    get_vectored_impl: BuilderValue<GetVectoredImpl>,
    get_impl: BuilderValue<GetImpl>,
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
    validate_vectored_get: BuilderValue<bool>,
@@ -503,6 +491,7 @@ impl PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
            get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
@@ -681,6 +670,10 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }
    pub fn get_impl(&mut self, value: GetImpl) {
        self.get_impl = BuilderValue::Set(value);
    }
    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
        self.max_vectored_read_bytes = BuilderValue::Set(value);
    }
@@ -750,6 +743,7 @@ impl PageServerConfigBuilder {
                secondary_download_concurrency,
                ingest_batch_size,
                get_vectored_impl,
                get_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
@@ -1035,6 +1029,9 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
                "get_impl" => {
                    builder.get_impl(parse_toml_from_str("get_impl", item)?)
                }
                "max_vectored_read_bytes" => {
                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
                    builder.get_max_vectored_read_bytes(
@@ -1126,6 +1123,7 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
            get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
            max_vectored_read_bytes: MaxVectoredReadBytes(
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
@@ -1365,6 +1363,7 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
@@ -1438,6 +1437,7 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
@@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}'
                        endpoint: Some(endpoint.clone()),
                        concurrency_limit: s3_concurrency_limit,
                        max_keys_per_list_response: None,
                        upload_storage_class: None,
                    }),
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -14,10 +14,8 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
-use crate::{
+use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
-    config::{NodeMetadata, PageServerConf},
+use pageserver_api::config::NodeMetadata;
    virtual_file::on_fatal_io_error,
 };
 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -65,7 +63,7 @@ impl ControlPlaneClient {
        let mut client = reqwest::ClientBuilder::new();
        if let Some(jwt) = &conf.control_plane_api_token {
-            let mut headers = hyper::HeaderMap::new();
+            let mut headers = reqwest::header::HeaderMap::new();
            headers.insert(
                "Authorization",
                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
 use list_writer::ListWriterQueueMessage;
 use validator::ValidatorQueueMessage;
-use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
 // TODO: configurable for how long to wait before executing deletions
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
@@ -632,7 +632,7 @@ impl DeletionQueue {
    ///
    /// If remote_storage is None, then the returned workers will also be None.
    pub fn new<C>(
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
        control_plane_client: Option<C>,
        conf: &'static PageServerConf,
    ) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,23 +658,6 @@ impl DeletionQueue {
        // longer to flush after Tenants have all been torn down.
        let cancel = CancellationToken::new();
        let remote_storage = match remote_storage {
            None => {
                return (
                    Self {
                        client: DeletionQueueClient {
                            tx,
                            executor_tx,
                            lsn_table: lsn_table.clone(),
                        },
                        cancel,
                    },
                    None,
                )
            }
            Some(r) => r,
        };
        (
            Self {
                client: DeletionQueueClient {
@@ -734,20 +717,20 @@ mod test {
    use crate::{
        control_plane_client::RetryForeverError,
        repository::Key,
-        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
+        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
    };
    use super::*;
    pub const TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("11223344556677881122334455667788"));
-    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
    });
    // When you need a second layer in a test.
-    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
    });
@@ -765,7 +748,7 @@ mod test {
        /// Simulate a pageserver restart by destroying and recreating the deletion queue
        async fn restart(&mut self) {
            let (deletion_queue, workers) = DeletionQueue::new(
-                Some(self.storage.clone()),
+                self.storage.clone(),
                Some(self.mock_control_plane.clone()),
                self.harness.conf,
            );
@@ -797,7 +780,7 @@ mod test {
        /// Returns remote layer file name, suitable for use in assert_remote_files
        fn write_remote_layer(
            &self,
-            file_name: LayerFileName,
+            file_name: LayerName,
            gen: Generation,
        ) -> anyhow::Result<String> {
            let tenant_shard_id = self.harness.tenant_shard_id;
@@ -875,7 +858,7 @@ mod test {
        let mock_control_plane = MockControlPlane::new();
        let (deletion_queue, worker) = DeletionQueue::new(
-            Some(storage.clone()),
+            storage.clone(),
            Some(mock_control_plane.clone()),
            harness.conf,
        );
@@ -952,7 +935,7 @@ mod test {
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_shard_id = ctx.harness.tenant_shard_id;
        let content: Vec<u8> = "victim1 contents".into();
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
    // have a config object handy to project it to a remote key, and need the consuming worker
    // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
+    pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
    pub(super) objects: Vec<RemotePath>,
    /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -64,7 +64,7 @@ use crate::{
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
    },
 };
@@ -534,13 +534,12 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size();
+                    let file_size = layer.metadata.file_size;
                    let tenant_manager = tenant_manager.clone();
                    js.spawn(async move {
                        layer
                            .secondary_tenant
-                            .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
+                            .evict_layer(layer.timeline_id, layer.name)
                            .await;
                        Ok(file_size)
                    });
@@ -599,7 +598,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 pub(crate) struct EvictionSecondaryLayer {
    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
    pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerFileName,
+    pub(crate) name: LayerName,
    pub(crate) metadata: LayerFileMetadata,
 }
@@ -632,9 +631,9 @@ impl EvictionLayer {
        }
    }
-    pub(crate) fn get_name(&self) -> LayerFileName {
+    pub(crate) fn get_name(&self) -> LayerName {
        match self {
-            Self::Attached(l) => l.layer_desc().filename(),
+            Self::Attached(l) => l.layer_desc().layer_name(),
            Self::Secondary(sl) => sl.name.clone(),
        }
    }
@@ -642,7 +641,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
+            Self::Secondary(sl) => sl.metadata.file_size,
        }
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,6 +257,37 @@ paths:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"
  # LSN lease endpoint: a POST with a required `lsn` query parameter returns
  # an `LsnLease` object on success.
  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
    # Path parameters shared by every operation on this route.
    parameters:
      - name: tenant_shard_id
        in: path
        required: true
        schema:
          type: string
      - name: timeline_id
        in: path
        required: true
        schema:
          type: string
          format: hex
    post:
      description: Obtain lease for the given LSN
      parameters:
        - name: lsn
          in: query
          required: true
          schema:
            type: string
            format: hex
          description: A LSN to obtain the lease for
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/LsnLease"
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -420,25 +451,6 @@ paths:
          description: Tenant scheduled to load successfully
  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
      - name: tenant_id
        in: path
        required: true
        schema:
          type: string
    get:
      description: |
        Calculate tenant's synthetic size
      responses:
        "200":
          description: Tenant's synthetic size
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
  # This route has no handler. TODO: remove?
  /v1/tenant/{tenant_id}/size:
    parameters:
      - name: tenant_id
        in: path
@@ -468,19 +480,9 @@ paths:
          content:
            application/json:
              schema:
-                type: object
+                $ref: "#/components/schemas/SyntheticSizeResponse"
-                required:
+            text/html:
-                  - id
+              description: SVG representation of the tenant and its timelines.
                  - size
                properties:
                  id:
                    type: string
                    format: hex
                  size:
                    type: integer
                    nullable: true
                    description: |
                      Size metric in bytes or null if inputs_only=true was given.
        "401":
          description: Unauthorized Error
          content:
@@ -610,6 +612,80 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"
  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
    parameters:
      - name: tenant_shard_id
        in: path
        required: true
        schema:
          type: string
      - name: timeline_id
        in: path
        required: true
        schema:
          type: string
    put:
      description: |
        Detach a timeline from its ancestor and reparent all ancestor's timelines with lower `ancestor_lsn`.
        Current implementation might not be retryable across failure cases, but will be enhanced in future.
        Detaching should be expected to be expensive operation. Timeouts should be retried.
      responses:
        "200":
          description: |
            The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
            If any timelines were deleted after reparenting, they might not be on this list.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/AncestorDetached"
        "400":
          description: |
            Number of early checks meaning the timeline cannot be detached now:
              - the ancestor of timeline has an ancestor: not supported, see RFC
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "404":
          description: Tenant or timeline not found.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
          description: |
            The timeline can never be detached:
              - timeline has no ancestor, implying that the timeline has never had an ancestor
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
        "500":
          description: |
            Transient error, for example, pageserver shutdown happened while
            processing the request but we were unable to distinguish that. Must
            be retried.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
        "503":
          description: |
            Temporarily unavailable, please retry. Possible reasons:
              - another timeline detach for the same tenant is underway, please retry later
              - detected shutdown error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -782,9 +858,6 @@ components:
      required:
        - mode
      properties:
        tenant_id:
          type: string
          description: Not used, scheduled for removal.
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -932,6 +1005,9 @@ components:
          format: hex
        size:
          type: integer
          nullable: true
          description: |
            Size metric in bytes or null if inputs_only=true was given.
        segment_sizes:
          type: array
          items:
@@ -1009,6 +1085,15 @@ components:
          type: string
          enum: [past, present, future, nodata]
    # Response body of the `lsn_lease` POST operation.
    LsnLease:
      type: object
      required:
        - valid_until
      properties:
        # Timestamp in RFC 3339 `date-time` format.
        valid_until:
          type: string
          format: date-time
    PageserverUtilization:
      type: object
      required:
@@ -1066,6 +1151,19 @@ components:
          format: int64
          description: How many bytes of layer content were in the latest layer heatmap
    # Response body of the `detach_ancestor` PUT operation (200 response).
    AncestorDetached:
      type: object
      required:
        - reparented_timelines
      properties:
        reparented_timelines:
          type: array
          description: Set of reparented timeline ids
          # The element schema of an array must be declared under `items`;
          # `properties` only applies to objects and left the elements untyped.
          items:
            type: string
            format: hex
            description: TimelineId
    Error:
      type: object
--- a/Show More
+++ b/Show More
`@@ -1,2 +1,2 @@`
	`[profile.default]`	`[profile.default]`
	`slow-timeout = { period = "20s", terminate-after = 3 }`	`slow-timeout = { period = "60s", terminate-after = 3 }`
`@@ -1,4 +1,6 @@`
	`[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)`	`[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)`



	`# Neon`	`# Neon`