Add relsize cache stress test

I wrote this as a regression test for the bug fixed in #8807, but it seems like a useful test in general. Without the fix from PR #8807, this readily fails with an assertion failure or other errors. XXX: This is failing on 'main', even with the fix from #8807, just not as quickly. This uses a small neon.relsize_hash_size=100 setting; when I bump it up to 1000 or more, it doesn't fail anymore. But this suggests that it's still possible to "overwhelm" the relsize cache.
Add dump_relsize_cache() function
2026-03-15 06:10:36 +00:00 · 2024-08-23 14:38:05 +03:00 · 2024-08-23 14:37:57 +03:00
339 changed files with 6849 additions and 12730 deletions
--- a/.devcontainer/Dockerfile.devcontainer
+++ b/.devcontainer/Dockerfile.devcontainer
@@ -1 +0,0 @@
-FROM neondatabase/build-tools:pinned
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,23 +0,0 @@
-// https://containers.dev/implementors/json_reference/
-{
-  "name": "Neon",
-  "build": {
-    "context": "..",
-    "dockerfile": "Dockerfile.devcontainer"
-  },
-
-  "postCreateCommand": {
-    "build neon": "BUILD_TYPE=debug CARGO_BUILD_FLAGS='--features=testing' mold -run make -s -j`nproc`",
-    "install python deps": "./scripts/pysync"
-  },
-
-  "customizations": {
-    "vscode": {
-      "extensions": [
-        "charliermarsh.ruff",
-        "github.vscode-github-actions",
-        "rust-lang.rust-analyzer"
-      ]
-    }
-  }
-}
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,6 +0,0 @@
-
-blank_issues_enabled: true
-contact_links:
-  - name: Feature request
-    url: https://console.neon.tech/app/projects?modal=feedback
-    about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,13 +7,6 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
-  - AZURE_DEV_CLIENT_ID
-  - AZURE_DEV_REGISTRY_NAME
-  - AZURE_DEV_SUBSCRIPTION_ID
-  - AZURE_PROD_CLIENT_ID
-  - AZURE_PROD_REGISTRY_NAME
-  - AZURE_PROD_SUBSCRIPTION_ID
-  - AZURE_TENANT_ID
  - BENCHMARK_PROJECT_ID_PUB
  - BENCHMARK_PROJECT_ID_SUB
  - REMOTE_STORAGE_AZURE_CONTAINER
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -71,7 +71,7 @@ runs:
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
        # The lack of compatibility snapshot (for example, for the new Postgres version)
@@ -211,13 +211,13 @@ runs:
        fi

    - name: Upload compatibility snapshot
-      # Note, that we use `github.base_ref` which is a target branch for a PR
-      if: github.event_name == 'pull_request' && github.base_ref == 'release'
+      if: github.ref_name == 'release'
      uses: ./.github/actions/upload
      with:
-        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
+        prefix: latest

    - name: Upload test results
      if: ${{ !cancelled() }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -216,14 +216,8 @@ jobs:
          #nextest does not yet support running doctests
          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES

-          # run all non-pageserver tests
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
-
-          # run pageserver tests with different settings
          for io_engine in std-fs tokio-epoll-uring ; do
-            for io_buffer_alignment in 0 1 512 ; do
-              NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-            done
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
          done

          # Run separate tests for real S3
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -1,56 +0,0 @@
-name: Push images to ACR
-on:
-  workflow_call:
-    inputs:
-      client_id:
-        description: Client ID of Azure managed identity or Entra app
-        required: true
-        type: string
-      image_tag:
-        description: Tag for the container image
-        required: true
-        type: string
-      images:
-        description: Images to push
-        required: true
-        type: string
-      registry_name:
-        description: Name of the container registry
-        required: true
-        type: string
-      subscription_id:
-        description: Azure subscription ID
-        required: true
-        type: string
-      tenant_id:
-        description: Azure tenant ID
-        required: true
-        type: string
-
-jobs:
-  push-to-acr:
-    runs-on: ubuntu-22.04
-    permissions:
-      contents: read  # This is required for actions/checkout
-      id-token: write # This is required for Azure Login to work.
-
-    steps:
-      - name: Azure login
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ inputs.client_id }}
-          subscription-id: ${{ inputs.subscription_id }}
-          tenant-id: ${{ inputs.tenant_id }}
-
-      - name: Login to ACR
-        run: |
-          az acr login --name=${{ inputs.registry_name }}
-
-      - name: Copy docker images to ACR ${{ inputs.registry_name }}
-        run: |
-          images='${{ inputs.images }}'
-          for image in ${images}; do
-            docker buildx imagetools create \
-              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                        neondatabase/${image}:${{ inputs.image_tag }}
-          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -286,7 +286,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          SYNC_AFTER_EACH_TEST: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -794,6 +793,9 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    permissions:
+      contents: read  # This is required for actions/checkout
+      id-token: write # This is required for Azure Login to work.
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

@@ -820,6 +822,28 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

+      - name: Azure login
+        if: github.ref_name == 'main'
+        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
+        with:
+          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+
+      - name: Login to ACR
+        if: github.ref_name == 'main'
+        run: |
+          az acr login --name=neoneastus2
+
+      - name: Copy docker images to ACR-dev
+        if: github.ref_name == 'main'
+        run: |
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+            docker buildx imagetools create \
+              -t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
+                                        neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
+          done
+
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
@@ -857,30 +881,6 @@ jobs:
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done

-  push-to-acr-dev:
-    if: github.ref_name == 'main'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
-  push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-    needs: [ tag, promote-images ]
-    uses: ./.github/workflows/_push-to-acr.yml
-    with:
-      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
-      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
-      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
-      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
-      tenant_id: ${{ vars.AZURE_TENANT_ID }}
-
  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
    runs-on: ubuntu-22.04
@@ -956,8 +956,8 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1055,88 +1055,43 @@ jobs:
              generate_release_notes: true,
            })

-  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
-    needs: [ deploy ]
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
    if: github.ref_name == 'release'

-    runs-on: ubuntu-22.04
+    runs-on: [ self-hosted, small ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
    steps:
-      - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
-        id: fetch-last-release-pr-info
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          branch_name_and_pr_number=$(gh pr list \
-            --repo "${GITHUB_REPOSITORY}" \
-            --base release \
-            --state merged \
-            --limit 10 \
-            --json mergeCommit,headRefName,number \
-            --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
-          branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
-          pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
-
-          run_id=$(gh run list \
-            --repo "${GITHUB_REPOSITORY}" \
-            --workflow build_and_test.yml \
-            --branch "${branch_name}" \
-            --json databaseId \
-            --limit 1 \
-            --jq '.[].databaseId')
-
-          last_commit_sha=$(gh pr view "${pr_number}" \
-            --repo "${GITHUB_REPOSITORY}" \
-            --json commits \
-            --jq '.commits[-1].oid')
-
-          echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
-          echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
-
-      - name: Promote compatibility snapshot and Neon artifact
+      - name: Promote compatibility snapshot for the release
        env:
          BUCKET: neon-github-public-dev
-          AWS_REGION: eu-central-1
-          COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
-          RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
+          PREFIX: artifacts/latest
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
-          new_prefix="artifacts/latest"
-
-          files_to_promote=()
-          files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
-
-          for arch in X64 ARM64; do
+          # Update compatibility snapshot for the release
+          for pg_version in v14 v15 v16; do
            for build_type in debug release; do
-              neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
-              s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
-              if [ -z "${s3_key}" ]; then
-                echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
-                exit 1
-              fi
+              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
+              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst

-              files_to_promote+=("s3://${BUCKET}/${s3_key}")
-
-              for pg_version in v14 v15 v16; do
-                # We run less tests for debug builds, so we don't need to promote them
-                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
-                  continue
-                fi
-
-                compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
-                s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
-                if [ -z "${s3_key}" ]; then
-                  echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
-                  exit 1
-                fi
-
-                files_to_promote+=("s3://${BUCKET}/${s3_key}")
-              done
+              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
            done
          done

-          for f in "${files_to_promote[@]}"; do
-            time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
+          # Update Neon artifact for the release (reuse already uploaded artifact)
+          for build_type in debug release; do
+            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
+            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
+
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            if [ -z "${S3_KEY}" ]; then
+              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
+              exit 1
+            fi
+
+            time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
          done

  pin-build-tools-image:
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -7,11 +7,6 @@ on:
  pull_request_target:
    types:
      - opened
-  workflow_dispatch:
-    inputs:
-      github-actor:
-        description: 'GitHub username. If empty, the username of the current user will be used'
-        required: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -31,31 +26,12 @@ jobs:
      id: check-user
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-        ACTOR: ${{ inputs.github-actor || github.actor }}
      run: |
-        expected_error="User does not exist or is not a member of the organization"
-        output_file=output.txt
-
-        for i in $(seq 1 10); do
-          if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
-
-            is_member=true
-            break
-          elif grep -q "${expected_error}" ${output_file}; then
-            is_member=false
-            break
-          elif [ $i -eq 10 ]; then
-            title="Failed to get memmbership status for ${ACTOR}"
-            message="The latest GitHub API error message: '$(cat ${output_file})'"
-            echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
-
-            exit 1
-          fi
-
-          sleep 1
-        done
+        if gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${GITHUB_ACTOR}"; then
+          is_member=true
+        else
+          is_member=false
+        fi

        echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -915,30 +915,27 @@ dependencies = [

 [[package]]
 name = "bindgen"
-version = "0.70.1"
+version = "0.65.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
+checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 1.3.2",
 "cexpr",
 "clang-sys",
- "itertools 0.12.1",
+ "lazy_static",
+ "lazycell",
 "log",
- "prettyplease 0.2.17",
+ "peeking_take_while",
+ "prettyplease 0.2.6",
 "proc-macro2",
 "quote",
 "regex",
 "rustc-hash",
 "shlex",
 "syn 2.0.52",
+ "which",
 ]

-[[package]]
-name = "bit_field"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
-
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -1189,9 +1186,9 @@ dependencies = [

 [[package]]
 name = "comfy-table"
-version = "7.1.1"
+version = "6.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
+checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
 dependencies = [
 "crossterm",
 "strum",
@@ -1246,7 +1243,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -1330,6 +1327,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "camino",
 "clap",
 "comfy-table",
@@ -1360,8 +1358,8 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-util",
- "toml",
- "toml_edit",
+ "toml 0.7.4",
+ "toml_edit 0.19.10",
 "tracing",
 "url",
 "utils",
@@ -1485,22 +1483,25 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"

 [[package]]
 name = "crossterm"
-version = "0.27.0"
+version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
+checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 1.3.2",
 "crossterm_winapi",
 "libc",
+ "mio",
 "parking_lot 0.12.1",
+ "signal-hook",
+ "signal-hook-mio",
 "winapi",
 ]

 [[package]]
 name = "crossterm_winapi"
-version = "0.9.1"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
+checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
 dependencies = [
 "winapi",
 ]
@@ -1671,9 +1672,9 @@ dependencies = [

 [[package]]
 name = "diesel"
-version = "2.2.3"
+version = "2.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
+checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
@@ -2721,12 +2722,6 @@ dependencies = [
 "hashbrown 0.14.5",
 ]

-[[package]]
-name = "indoc"
-version = "2.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
-
 [[package]]
 name = "infer"
 version = "0.2.3"
@@ -2943,6 +2938,23 @@ dependencies = [
 "spin 0.5.2",
 ]

+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
+[[package]]
+name = "leaky-bucket"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
+dependencies = [
+ "parking_lot 0.12.1",
+ "tokio",
+ "tracing",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3141,7 +3153,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
 dependencies = [
 "serde",
- "toml",
+ "toml 0.8.14",
 ]

 [[package]]
@@ -3657,7 +3669,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "utils",
 "workspace_hack",
 ]
@@ -3671,7 +3683,6 @@ dependencies = [
 "async-compression",
 "async-stream",
 "async-trait",
- "bit_field",
 "byteorder",
 "bytes",
 "camino",
@@ -3695,8 +3706,8 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
- "indoc",
 "itertools 0.10.5",
+ "leaky-bucket",
 "md5",
 "metrics",
 "nix 0.27.1",
@@ -3721,7 +3732,6 @@ dependencies = [
 "reqwest 0.12.4",
 "rpds",
 "scopeguard",
- "send-future",
 "serde",
 "serde_json",
 "serde_path_to_error",
@@ -3744,7 +3754,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "twox-hash",
 "url",
@@ -3761,7 +3771,6 @@ dependencies = [
 "bincode",
 "byteorder",
 "bytes",
- "camino",
 "chrono",
 "const_format",
 "enum-map",
@@ -3769,16 +3778,11 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "itertools 0.10.5",
- "nix 0.27.1",
- "postgres_backend",
 "postgres_ffi",
 "rand 0.8.5",
- "remote_storage",
- "reqwest 0.12.4",
 "serde",
 "serde_json",
 "serde_with",
- "storage_broker",
 "strum",
 "strum_macros",
 "thiserror",
@@ -3790,6 +3794,7 @@ name = "pageserver_client"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "bytes",
 "futures",
 "pageserver_api",
@@ -3907,9 +3912,8 @@ dependencies = [

 [[package]]
 name = "parquet"
-version = "53.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "ahash",
 "bytes",
@@ -3928,9 +3932,8 @@ dependencies = [

 [[package]]
 name = "parquet_derive"
-version = "53.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "parquet",
 "proc-macro2",
@@ -3967,6 +3970,12 @@ dependencies = [
 "sha2",
 ]

+[[package]]
+name = "peeking_take_while"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -4120,7 +4129,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4133,7 +4142,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -4152,7 +4161,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4264,9 +4273,9 @@ dependencies = [

 [[package]]
 name = "prettyplease"
-version = "0.2.17"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
+checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
 dependencies = [
 "proc-macro2",
 "syn 2.0.52",
@@ -4811,7 +4820,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "utils",
 ]
@@ -5321,7 +5330,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-subscriber",
 "url",
@@ -5446,12 +5455,6 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"

-[[package]]
-name = "send-future"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
-
 [[package]]
 name = "sentry"
 version = "0.32.3"
@@ -5587,12 +5590,11 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.125"
+version = "1.0.96"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
+checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
 dependencies = [
 "itoa",
- "memchr",
 "ryu",
 "serde",
 ]
@@ -5730,6 +5732,17 @@ dependencies = [
 "signal-hook-registry",
 ]

+[[package]]
+name = "signal-hook-mio"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
+dependencies = [
+ "libc",
+ "mio",
+ "signal-hook",
+]
+
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.1"
@@ -5936,6 +5949,7 @@ name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "bytes",
 "futures",
 "pageserver_api",
@@ -6042,21 +6056,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"

 [[package]]
 name = "strum"
-version = "0.26.3"
+version = "0.24.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"

 [[package]]
 name = "strum_macros"
-version = "0.26.4"
+version = "0.24.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
 dependencies = [
- "heck 0.5.0",
+ "heck 0.4.1",
 "proc-macro2",
 "quote",
 "rustversion",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -6067,9 +6081,8 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"

 [[package]]
 name = "svg_fmt"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20e16a0f46cf5fd675563ef54f26e83e20f2366bcf027bcb3cc3ed2b98aaf2ca"
+version = "0.4.2"
+source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"

 [[package]]
 name = "syn"
@@ -6397,7 +6410,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6508,6 +6521,18 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "toml"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit 0.19.10",
+]
+
 [[package]]
 name = "toml"
 version = "0.8.14"
@@ -6517,7 +6542,7 @@ dependencies = [
 "serde",
 "serde_spanned",
 "toml_datetime",
- "toml_edit",
+ "toml_edit 0.22.14",
 ]

 [[package]]
@@ -6529,6 +6554,19 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "toml_edit"
+version = "0.19.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
+dependencies = [
+ "indexmap 1.9.3",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "winnow 0.4.6",
+]
+
 [[package]]
 name = "toml_edit"
 version = "0.22.14"
@@ -6539,7 +6577,7 @@ dependencies = [
 "serde",
 "serde_spanned",
 "toml_datetime",
- "winnow",
+ "winnow 0.6.13",
 ]

 [[package]]
@@ -6914,6 +6952,7 @@ dependencies = [
 "anyhow",
 "arc-swap",
 "async-compression",
+ "async-trait",
 "bincode",
 "byteorder",
 "bytes",
@@ -6929,6 +6968,7 @@ dependencies = [
 "humantime",
 "hyper 0.14.26",
 "jsonwebtoken",
+ "leaky-bucket",
 "metrics",
 "nix 0.27.1",
 "once_cell",
@@ -6952,7 +6992,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -7498,6 +7538,15 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"

+[[package]]
+name = "winnow"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "winnow"
 version = "0.6.13"
@@ -7567,7 +7616,6 @@ dependencies = [
 "hyper 0.14.26",
 "indexmap 1.9.3",
 "itertools 0.10.5",
- "itertools 0.12.1",
 "lazy_static",
 "libc",
 "log",
@@ -7605,7 +7653,6 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.24.0",
 "tokio-util",
- "toml_edit",
 "tonic",
 "tower",
 "tracing",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,8 +64,7 @@ aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
-bindgen = "0.70"
-bit_field = "0.10.2"
+bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
@@ -73,7 +72,7 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-comfy-table = "7.1"
+comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
@@ -103,18 +102,18 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 tokio-tungstenite = "0.20.0"
 indexmap = "2"
-indoc = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
 lasso = "0.7"
+leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.8"
-nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
@@ -123,8 +122,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "53", default-features = false, features = ["zstd"] }
-parquet_derive = "53"
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
@@ -146,7 +145,6 @@ rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
-send-future = "0.1.0"
 sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
@@ -158,10 +156,11 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-strum = "0.26"
-strum_macros = "0.26"
+strum = "0.24"
+strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.3"
+# Our PR https://github.com/nical/rust_debug/pull/4 has been merged, but no new version has been released yet
+svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
@@ -177,8 +176,8 @@ tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
-toml = "0.8"
-toml_edit = "0.22"
+toml = "0.7"
+toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
@@ -201,21 +200,10 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-
-# We want to use the 'neon' branch for these, but there's currently one
-# incompatible change on the branch. See:
-#
-# - PR #8076 which contained changes that depended on the new changes in
-#   the rust-postgres crate, and
-# - PR #8654 which reverted those changes and made the code in proxy incompatible
-#   with the tip of the 'neon' branch again.
-#
-# When those proxy changes are re-applied (see PR #8747), we can switch using
-# the tip of the 'neon' branch again.
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -252,7 +240,11 @@ tonic-build = "0.9"
 [patch.crates-io]

 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+
+# Use the arrow-rs master branch to pick up bug fixes for UUID
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }

 ################# Binary contents sections

--- a/1
+++ b/1
@@ -87,7 +87,6 @@ RUN mkdir -p /data/.neon/ && \
       "pg_distrib_dir='/usr/local/'\n" \
       "listen_pg_addr='0.0.0.0:6400'\n" \
       "listen_http_addr='0.0.0.0:9898'\n" \
-       "availability_zone='local'\n" \
  > /data/.neon/pageserver.toml && \
  chown -R neon:neon /data/.neon

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -192,7 +192,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.81.0
+ENV RUSTC_VERSION=1.80.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -207,7 +207,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    export PATH="$HOME/.cargo/bin:$PATH" && \
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
-    rustup component add llvm-tools rustfmt clippy && \
+    rustup component add llvm-tools-preview rustfmt clippy && \
    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
 COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY patches/pg_hint_plan.patch /ext-src
+COPY patches/pg_hintplan.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
 RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
 RUN apt-get update && apt-get install -y cmake
-RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
+RUN patch -p1 < /ext-src/pg_hintplan.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN patch -p1 </ext-src/pg_anon.patch
 RUN patch -p1 </ext-src/pg_cron.patch
--- a/README.md
+++ b/README.md
@@ -64,12 +64,6 @@ brew install protobuf openssl flex bison icu4c pkg-config
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```

-If you get errors about missing `m4` you may have to install it manually:
-```
-brew install m4
-brew link --force m4
-```
-
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
 # recommended approach from https://www.rust-lang.org/tools/install
@@ -132,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
+Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.


 #### Running neon database
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,7 +44,6 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info, warn};
@@ -367,8 +366,6 @@ fn wait_spec(
        state.start_time = now;
    }

-    launch_lsn_lease_bg_task_for_static(&compute);
-
    Ok(WaitSpecResult {
        compute,
        http_port,
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,7 +11,6 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
-pub mod lsn_lease;
 mod migration;
 pub mod monitor;
 pub mod params;
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -1,186 +0,0 @@
-use anyhow::bail;
-use anyhow::Result;
-use postgres::{NoTls, SimpleQueryMessage};
-use std::time::SystemTime;
-use std::{str::FromStr, sync::Arc, thread, time::Duration};
-use utils::id::TenantId;
-use utils::id::TimelineId;
-
-use compute_api::spec::ComputeMode;
-use tracing::{info, warn};
-use utils::{
-    lsn::Lsn,
-    shard::{ShardCount, ShardNumber, TenantShardId},
-};
-
-use crate::compute::ComputeNode;
-
-/// Spawns a background thread to periodically renew LSN leases for static compute.
-/// Do nothing if the compute is not in static mode.
-pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
-    let (tenant_id, timeline_id, lsn) = {
-        let state = compute.state.lock().unwrap();
-        let spec = state.pspec.as_ref().expect("Spec must be set");
-        match spec.spec.mode {
-            ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
-            _ => return,
-        }
-    };
-    let compute = compute.clone();
-
-    let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
-    thread::spawn(move || {
-        let _entered = span.entered();
-        if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
-            // TODO: might need stronger error feedback than logging an warning.
-            warn!("Exited with error: {e}");
-        }
-    });
-}
-
-/// Renews lsn lease periodically so static compute are not affected by GC.
-fn lsn_lease_bg_task(
-    compute: Arc<ComputeNode>,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    lsn: Lsn,
-) -> Result<()> {
-    loop {
-        let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
-        let valid_duration = valid_until
-            .duration_since(SystemTime::now())
-            .unwrap_or(Duration::ZERO);
-
-        // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
-        let sleep_duration = valid_duration
-            .saturating_sub(Duration::from_secs(60))
-            .max(valid_duration / 2);
-
-        info!(
-            "Succeeded, sleeping for {} seconds",
-            sleep_duration.as_secs()
-        );
-        thread::sleep(sleep_duration);
-    }
-}
-
-/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
-/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
-fn acquire_lsn_lease_with_retry(
-    compute: &Arc<ComputeNode>,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    lsn: Lsn,
-) -> Result<SystemTime> {
-    let mut attempts = 0usize;
-    let mut retry_period_ms: f64 = 500.0;
-    const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
-
-    loop {
-        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let configs = {
-            let state = compute.state.lock().unwrap();
-
-            let spec = state.pspec.as_ref().expect("spec must be set");
-
-            let conn_strings = spec.pageserver_connstr.split(',');
-
-            conn_strings
-                .map(|connstr| {
-                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
-                    if let Some(storage_auth_token) = &spec.storage_auth_token {
-                        info!("Got storage auth token from spec file");
-                        config.password(storage_auth_token.clone());
-                    } else {
-                        info!("Storage auth token not set");
-                    }
-                    config
-                })
-                .collect::<Vec<_>>()
-        };
-
-        let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
-        match result {
-            Ok(Some(res)) => {
-                return Ok(res);
-            }
-            Ok(None) => {
-                bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
-            }
-            Err(e) => {
-                warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
-
-                thread::sleep(Duration::from_millis(retry_period_ms as u64));
-                retry_period_ms *= 1.5;
-                retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
-            }
-        }
-        attempts += 1;
-    }
-}
-
-/// Tries to acquire an LSN lease through PS page_service API.
-fn try_acquire_lsn_lease(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    lsn: Lsn,
-    configs: &[postgres::Config],
-) -> Result<Option<SystemTime>> {
-    fn get_valid_until(
-        config: &postgres::Config,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-    ) -> Result<Option<SystemTime>> {
-        let mut client = config.connect(NoTls)?;
-        let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
-        let res = client.simple_query(&cmd)?;
-        let msg = match res.first() {
-            Some(msg) => msg,
-            None => bail!("empty response"),
-        };
-        let row = match msg {
-            SimpleQueryMessage::Row(row) => row,
-            _ => bail!("error parsing lsn lease response"),
-        };
-
-        // Note: this will be None if a lease is explicitly not granted.
-        let valid_until_str = row.get("valid_until");
-
-        let valid_until = valid_until_str.map(|s| {
-            SystemTime::UNIX_EPOCH
-                .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
-                .expect("Time larger than max SystemTime could handle")
-        });
-        Ok(valid_until)
-    }
-
-    let shard_count = configs.len();
-
-    let valid_until = if shard_count > 1 {
-        configs
-            .iter()
-            .enumerate()
-            .map(|(shard_number, config)| {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount::new(shard_count as u8),
-                    shard_number: ShardNumber(shard_number as u8),
-                };
-                get_valid_until(config, tenant_shard_id, timeline_id, lsn)
-            })
-            .collect::<Result<Vec<Option<SystemTime>>>>()?
-            .into_iter()
-            .min()
-            .unwrap()
-    } else {
-        get_valid_until(
-            &configs[0],
-            TenantShardId::unsharded(tenant_id),
-            timeline_id,
-            lsn,
-        )?
-    };
-
-    Ok(valid_until)
-}
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -22,10 +22,9 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Escape a string for including it in a SQL literal.
-///
-/// Wrapping the result with `E'{}'` or `'{}'` is not required,
-/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
+/// Escape a string for including it in a SQL literal. Wrapping the result
+/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
+/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
 /// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -640,8 +640,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
        }
        Some(("branch", branch_match)) => {
            let tenant_id = get_tenant_id(branch_match, env)?;
-            let new_timeline_id =
-                parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
            let new_branch_name = branch_match
                .get_one::<String>("branch-name")
                .ok_or_else(|| anyhow!("No branch name provided"))?;
@@ -660,6 +658,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .map(|lsn_str| Lsn::from_str(lsn_str))
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
+            let new_timeline_id = TimelineId::generate();
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
@@ -1571,6 +1570,7 @@ fn cli() -> Command {
                        .value_parser(value_parser!(PathBuf))
                        .value_name("config")
                )
+                .arg(pg_version_arg.clone())
                .arg(force_arg)
        )
        .subcommand(
@@ -1583,7 +1583,6 @@ fn cli() -> Command {
            .subcommand(Command::new("branch")
                .about("Create a new timeline, using another timeline as a base, copying its data")
                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
                    .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -165,9 +165,6 @@ pub struct NeonStorageControllerConf {
    pub split_threshold: Option<u64>,

    pub max_secondary_lag_bytes: Option<u64>,
-
-    #[serde(with = "humantime_serde")]
-    pub heartbeat_interval: Duration,
 }

 impl NeonStorageControllerConf {
@@ -175,9 +172,6 @@ impl NeonStorageControllerConf {
    const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);

    const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
-
-    // Very tight heartbeat interval to speed up tests
-    const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
 }

 impl Default for NeonStorageControllerConf {
@@ -189,7 +183,6 @@ impl Default for NeonStorageControllerConf {
            database_url: None,
            split_threshold: None,
            max_secondary_lag_bytes: None,
-            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -75,14 +75,14 @@ impl PageServerNode {
        }
    }

-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
-        toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
+        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
    }

    fn pageserver_init_make_toml(
        &self,
        conf: NeonLocalInitPageserverConf,
-    ) -> anyhow::Result<toml_edit::DocumentMut> {
+    ) -> anyhow::Result<toml_edit::Document> {
        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");

        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
@@ -137,9 +137,9 @@ impl PageServerNode {

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
-        let mut config_toml = toml_edit::DocumentMut::new();
+        let mut config_toml = toml_edit::Document::new();
        for fragment_str in overrides {
-            let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
+            let fragment = toml_edit::Document::from_str(&fragment_str)
                .expect("all fragments in `overrides` are valid toml documents, this function controls that");
            for (key, item) in fragment.iter() {
                config_toml.insert(key, item.clone());
@@ -181,23 +181,6 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

-        // If the config file we got as a CLI argument includes the `availability_zone`
-        // config, then use that to populate the `metadata.json` file for the pageserver.
-        // In production the deployment orchestrator does this for us.
-        let az_id = conf
-            .other
-            .get("availability_zone")
-            .map(|toml| {
-                let az_str = toml.to_string();
-                // Trim the (") chars from the toml representation
-                if az_str.starts_with('"') && az_str.ends_with('"') {
-                    az_str[1..az_str.len() - 1].to_string()
-                } else {
-                    az_str
-                }
-            })
-            .unwrap_or("local".to_string());
-
        let config = self
            .pageserver_init_make_toml(conf)
            .context("make pageserver toml")?;
@@ -233,7 +216,6 @@ impl PageServerNode {
        let (_http_host, http_port) =
            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
        let http_port = http_port.unwrap_or(9898);
-
        // Intentionally hand-craft JSON: this acts as an implicit format compat test
        // in case the pageserver-side structure is edited, and reflects the real life
        // situation: the metadata is written by some other script.
@@ -244,10 +226,7 @@ impl PageServerNode {
                postgres_port: self.pg_connection_config.port(),
                http_host: "localhost".to_string(),
                http_port,
-                other: HashMap::from([(
-                    "availability_zone_id".to_string(),
-                    serde_json::json!(az_id),
-                )]),
+                other: HashMap::new(),
            })
            .unwrap(),
        )
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -5,7 +5,6 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
-use std::future::Future;
 use std::io::Write;
 use std::path::PathBuf;
 use std::time::Duration;
@@ -35,10 +34,12 @@ pub enum SafekeeperHttpError {

 type Result<T> = result::Result<T, SafekeeperHttpError>;

-pub(crate) trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
+#[async_trait::async_trait]
+pub trait ResponseErrorMessageExt: Sized {
+    async fn error_from_body(self) -> Result<Self>;
 }

+#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
    async fn error_from_body(self) -> Result<Self> {
        let status = self.status();
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -437,8 +437,6 @@ impl StorageController {
            &humantime::Duration::from(self.config.max_offline).to_string(),
            "--max-warming-up-interval",
            &humantime::Duration::from(self.config.max_warming_up).to_string(),
-            "--heartbeat-interval",
-            &humantime::Duration::from(self.config.heartbeat_interval).to_string(),
            "--address-for-peers",
            &address_for_peers.to_string(),
        ]
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -41,8 +41,6 @@ enum Command {
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
-        #[arg(long)]
-        availability_zone_id: String,
    },

    /// Modify a node's configuration in the storage controller
@@ -80,10 +78,7 @@ enum Command {
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
-    Tenants {
-        /// If this field is set, it will list the tenants on a specific node
-        node_id: Option<NodeId>,
-    },
+    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
@@ -152,9 +147,9 @@ enum Command {
        #[arg(long)]
        threshold: humantime::Duration,
    },
-    // Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
+    // Drain a set of specified pageservers by moving the primary attachments to pageservers
    // outside of the specified set.
-    BulkMigrate {
+    Drain {
        // Set of pageserver node ids to drain.
        #[arg(long)]
        nodes: Vec<NodeId>,
@@ -168,34 +163,6 @@ enum Command {
        #[arg(long)]
        dry_run: Option<bool>,
    },
-    /// Start draining the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartDrain {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel draining the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelDrain {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
-    /// Start filling the specified pageserver.
-    /// The drain is complete when the schedulling policy returns to active.
-    StartFill {
-        #[arg(long)]
-        node_id: NodeId,
-    },
-    /// Cancel filling the specified pageserver and wait for `timeout`
-    /// for the operation to be canceled. May be retried.
-    CancelFill {
-        #[arg(long)]
-        node_id: NodeId,
-        #[arg(long)]
-        timeout: humantime::Duration,
-    },
 }

 #[derive(Parser)]
@@ -282,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
    }
 }

-async fn wait_for_scheduling_policy<F>(
-    client: Client,
-    node_id: NodeId,
-    timeout: Duration,
-    f: F,
-) -> anyhow::Result<NodeSchedulingPolicy>
-where
-    F: Fn(NodeSchedulingPolicy) -> bool,
-{
-    let waiter = tokio::time::timeout(timeout, async move {
-        loop {
-            let node = client
-                .dispatch::<(), NodeDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}"),
-                    None,
-                )
-                .await?;
-
-            if f(node.scheduling) {
-                return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
-            }
-        }
-    });
-
-    Ok(waiter.await??)
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
@@ -327,7 +266,6 @@ async fn main() -> anyhow::Result<()> {
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
-            availability_zone_id,
        } => {
            storcon_client
                .dispatch::<_, ()>(
@@ -339,7 +277,6 @@ async fn main() -> anyhow::Result<()> {
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
-                        availability_zone_id,
                    }),
                )
                .await?;
@@ -406,41 +343,7 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::Tenants {
-            node_id: Some(node_id),
-        } => {
-            let describe_response = storcon_client
-                .dispatch::<(), NodeShardResponse>(
-                    Method::GET,
-                    format!("control/v1/node/{node_id}/shards"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "Shard",
-                "Intended Primary/Secondary",
-                "Observed Primary/Secondary",
-            ]);
-            for shard in shards {
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    match shard.is_intended_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                    match shard.is_observed_secondary {
-                        None => "".to_string(),
-                        Some(true) => "Secondary".to_string(),
-                        Some(false) => "Primary".to_string(),
-                    },
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::Tenants { node_id: None } => {
+        Command::Tenants {} => {
            let mut resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
@@ -725,7 +628,7 @@ async fn main() -> anyhow::Result<()> {
                })
                .await?;
        }
-        Command::BulkMigrate {
+        Command::Drain {
            nodes,
            concurrency,
            max_shards,
@@ -754,7 +657,7 @@ async fn main() -> anyhow::Result<()> {
            }

            if nodes.len() != node_to_drain_descs.len() {
-                anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
+                anyhow::bail!("Drain requested for node which doesn't exist.")
            }

            node_to_fill_descs.retain(|desc| {
@@ -766,7 +669,7 @@ async fn main() -> anyhow::Result<()> {
            });

            if node_to_fill_descs.is_empty() {
-                anyhow::bail!("There are no nodes to migrate to")
+                anyhow::bail!("There are no nodes to drain to")
            }

            // Set the node scheduling policy to draining for the nodes which
@@ -787,7 +690,7 @@ async fn main() -> anyhow::Result<()> {
                    .await?;
            }

-            // Perform the migration: move each tenant shard scheduled on a node to
+            // Perform the drain: move each tenant shard scheduled on a node to
            // be drained to a node which is being filled. A simple round robin
            // strategy is used to pick the new node.
            let tenants = storcon_client
@@ -800,13 +703,13 @@ async fn main() -> anyhow::Result<()> {

            let mut selected_node_idx = 0;

-            struct MigrationMove {
+            struct DrainMove {
                tenant_shard_id: TenantShardId,
                from: NodeId,
                to: NodeId,
            }

-            let mut moves: Vec<MigrationMove> = Vec::new();
+            let mut moves: Vec<DrainMove> = Vec::new();

            let shards = tenants
                .into_iter()
@@ -836,7 +739,7 @@ async fn main() -> anyhow::Result<()> {
                    continue;
                }

-                moves.push(MigrationMove {
+                moves.push(DrainMove {
                    tenant_shard_id: shard.tenant_shard_id,
                    from: shard
                        .node_attached
@@ -913,67 +816,6 @@ async fn main() -> anyhow::Result<()> {
                failure
            );
        }
-        Command::StartDrain { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-            println!("Drain started for {node_id}");
-        }
-        Command::CancelDrain { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/drain"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active | PauseForRestart)
-                })
-                .await?;
-
-            println!(
-                "Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
-        Command::StartFill { node_id } => {
-            storcon_client
-                .dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
-                .await?;
-
-            println!("Fill started for {node_id}");
-        }
-        Command::CancelFill { node_id, timeout } => {
-            storcon_client
-                .dispatch::<(), ()>(
-                    Method::DELETE,
-                    format!("control/v1/node/{node_id}/fill"),
-                    None,
-                )
-                .await?;
-
-            println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
-
-            let final_policy =
-                wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
-                    use NodeSchedulingPolicy::*;
-                    matches!(sched, Active)
-                })
-                .await?;
-
-            println!(
-                "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
-            );
-        }
    }

    Ok(())
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -3,7 +3,7 @@ set -x

 cd /ext-src || exit 2
 FAILED=
-LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
 for d in ${LIST}
 do
       [ -d "${d}" ] || continue
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -441,6 +441,11 @@ WAL-log them periodically, from a backgound worker.

 Similarly to replications snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged

+FIXME: As far as I can see, the CID mapping files are not actually WAL-logged. Verify?
+
+FIXME: However, we do WAL-log the file in pg_logical/mappings. But as far as I can see, PostgreSQL
+already WAL-logs that itself. Why do we need separate WAL-logging for it? See the changes in rewriteheap.c.
+
 ### How to get rid of the patch

 WAL-log them periodically, from a backgound worker.
--- a/docs/rfcs/037-storage-controller-restarts.md
+++ b/docs/rfcs/037-storage-controller-restarts.md
@@ -1,259 +0,0 @@
-# Rolling Storage Controller Restarts
-
-## Summary
-
-This RFC describes the issues around the current storage controller restart procedure
-and describes an implementation which reduces downtime to a few milliseconds on the happy path.
-
-## Motivation
-
-Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
-While the storage controller does not sit on the main data path, it's generally not acceptable
-to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
-
-### Current Implementation
-
-The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
-In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
-a new instance is created.
-
-At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
-latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
-under unfavourable circumstances: pageservers are heavily loaded or unavailable.
-
-## Prior Art
-
-There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
-* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them.
-For fail-over, traffic is routed to one of the standbys (which becomes active).
-* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
-and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
-
-## Requirements
-
-* Reduce storage controller unavailability during upgrades to milliseconds
-* Minimize the interval in which it's possible for more than one storage controller
-to issue reconciles.
-* Have one uniform implementation for restarts and upgrades
-* Fit in with the current Kubernetes deployment scheme
-
-## Non Goals
-
-* Implement our own consensus algorithm from scratch
-* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks
-like a transient error to the control plane
-
-## Impacted Components
-
-* storage controller
-* deployment orchestration (i.e. Ansible)
-* helm charts
-
-## Terminology
-
-* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
-at start-up by quering pageservers
-* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
-a set of replicas
-
-## Implementation
-
-### High Level Flow
-
-At a very high level the proposed idea is to start a new storage controller instance while
-the previous one is still running and cut-over to it when it becomes ready. The new instance,
-should coordinate with the existing one and transition responsibility gracefully. While the controller
-has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
-scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
-were operating at the same time and require operator intervention to remedy.
-
-### Kubernetes Deployment Configuration
-
-On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
-to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
-Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
-scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
-
-The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
-
-### Storage Controller Start-Up
-
-This section describes the primitives required on the storage controller side and the flow of the happy path.
-
-#### Database Table For Leader Synchronization
-
-A new table should be added to the storage controller database for leader synchronization during startup.
-This table will always contain at most one row. The proposed name for the table is `leader` and the schema
-contains two elements:
-* `hostname`: represents the hostname for the current storage controller leader - should be addressible
-from other pods in the deployment
-* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
-for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
-
-Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
-at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
-situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
-level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
-READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
-the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
-our needs here.
-
-```
-START TRANSACTION ISOLATION LEVEL REPEATABLE READ
-UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
-WHERE hostname=<old_hostname>, start_timestampt=<old_start_ts>;
-```
-
-If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
-
-#### Step Down API
-
-A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
-request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
-and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
-snapshot of the observed state.
-
-If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
-for failure scenario handling - see [Handling Failures](#handling-failures)).
-
-#### Graceful Restart Happy Path
-
-At start-up, the first thing the storage controller does is retrieve the sole row from the new
-`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader.
-This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
-observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
-pageservers in order to build up the observed state.
-
-Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](database-table-for-leader-synchronization)
-section. If this step fails, the storage controller process exits.
-
-Note that no row will exist in the `leaders` table for the first graceful restart. In that case, force update the `leader` table
-(without the WHERE clause) and perform with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
-
-Summary of proposed new start-up sequence:
-1. Call `/step_down`
-2. Perform any pending database migrations
-3. Load state from database
-4. Load observed state returned in step (1) into memory
-5. Do initial heartbeat round (may be moved after 5)
-7. Mark self as leader by updating the database
-8. Reschedule and reconcile everything
-
-Some things to note from the steps above:
-* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
-calls to the pageserver and no compute notifications)
-* Ask the current leader to step down before loading state from database so we don't get a lost update
-if the transactions overlap.
-* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
-fall back to asking the pageservers about their current locations.
-* Database migrations should only run **after** the previous instance steps down (or the step down times out).
-
-
-[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
-so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
-
-### Handling Failures
-
-#### Storage Controller Crash Or Restart
-
-The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
-`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
-start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
-exists and consistency is maintained.
-
-#### Previous Leader Crashes Before New Leader Readiness
-
-When the previous leader (P1) crashes before the new leader (P2) passses the readiness check, Kubernetes will
-reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
-(see [2]).
-
-Now we have two cases to consider:
-* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
-by Kubernetes depending on timings.
-* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
-The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
-create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent.
-
-[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
-should avoid this self reference and fail the API call at the client if the persisted hostname matches
-the current one.
-
-#### Previous Leader Crashes After New Leader Readiness
-
-The deployment's replica sets already satisfy the deployment's replica count requirements and the
-Kubernetes deployment rollout will just clean up the dead pod.
-
-#### New Leader Crashes Before Pasing Readiness Check
-
-The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
-with the new pod.
-
-#### Network Partition Between New Pod and Previous Leader
-
-This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
-API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
-Kubernetes will terminate P1, but there may be a brief period where both storage controller can drive reconciles.
-
-### Dealing With Split Brain Scenarios
-
-As we've seen in the previous section, we can end up with two storage controller running at the same time. The split brain
-duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
-scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
-The rest of this section sketches some safety measure. It's likely overkill to implement all of them however.
-
-### Ensure Leadership Before Producing Side Effects
-
-The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
-Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
-applied if they race with the database updatem, but the situation will eventually be detected. The storage controller process should terminate in these cases.
-
-### Leadership Lease
-
-Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
-to be renewed periodically. Two new columns would be added to the leaders table:
-1. `last_renewed` - timestamp indicating when the lease was last renewed
-2. `lease_duration` - duration indicating the amount of time after which the lease expires
-
-The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
-same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
-to expire before acquiring leadership if they have not succesfully received a response to the `/step_down` request.
-
-### Notify Pageserver Of Storage Controller Term
-
-Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
-Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
-anything which contains a stale term (i.e. smaller than the current one).
-
-### Observability
-
-* The storage controller should expose a metric which describes it's state (`Active | WarmingUp | SteppedDown`).
-Per region alerts should be added on this metric which triggers when:
-  + no storage controller has been in the `Active` state for an extended period of time
-  + more than one storage controllers are in the `Active` state
-
-* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
-We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
-
-## Alternatives
-
-### Kubernetes Leases
-
-Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
-Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
-
-In our case, it would work something like this:
-* `/step_down` deletes the lease or stops it from renewing
-* lease acquisition becomes part of the start-up procedure
-
-The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
-not exactly trivial to implement.
-
-This approach has the benefit of baked in observability (`kubectl describe lease`), but:
-* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
-* More code surface than the simple "row in database" approach. Also, most of this code would be in
-a dependency not subject to code review, etc.
-* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it do
-so is not simple and complictes and the test set-up.
-
-To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
-to something external.
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
 1. Create a new branch based on the stable branch you are updating.

    ```shell
-    git checkout -b my-branch-15 REL_15_STABLE_neon
+    git checkout -b my-branch REL_15_STABLE_neon
    ```

-1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
+1. Tag the last commit on the stable branch you are updating.

-1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created onto the upstream tag and resolve any conflicts.

    ```shell
    git fetch upstream REL_15_4
-    git merge REL_15_4
+    git rebase REL_15_4
    ```

-    In the commit message of the merge commit, mention if there were
-    any non-trivial conflicts or other issues.
-
 1. Run the Postgres test suite to make sure our commits have not affected
 Postgres in a negative way.

@@ -48,7 +57,7 @@ Postgres in a negative way.
 1. Push your branch to the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15
+    git push origin my-branch
    ```

 1. Clone the Neon repository if you have not done so already.
@@ -65,7 +74,7 @@ branch.
 1. Update the Git submodule.

    ```shell
-    git submodule set-branch --branch my-branch-15 vendor/postgres-v15
+    git submodule set-branch --branch my-branch vendor/postgres-v15
    git submodule update --remote vendor/postgres-v15
    ```

@@ -80,12 +89,14 @@ minor Postgres release.

 1. Create a pull request, and wait for CI to go green.

-1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
+1. Force push the rebased Postgres branches into the Neon Postgres repository.

    ```shell
-    git push origin my-branch-15:REL_15_STABLE_neon
+    git push --force origin my-branch:REL_15_STABLE_neon
    ```

+    The force push may require disabling branch protections on the target branch first.
+
 1. Update your Neon PR to point at the branches.

    ```shell
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -68,7 +68,6 @@ macro_rules! register_uint_gauge {
 static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);

 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
-///
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
 pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -4,10 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-# See pageserver/Cargo.toml
-testing = ["dep:nix"]
-
 [dependencies]
 serde.workspace = true
 serde_with.workspace = true
@@ -27,12 +23,6 @@ thiserror.workspace = true
 humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
-storage_broker.workspace = true
-camino = {workspace = true, features = ["serde1"]}
-remote_storage.workspace = true
-postgres_backend.workspace = true
-nix = {workspace = true, optional = true}
-reqwest.workspace = true

 [dev-dependencies]
 bincode.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -1,28 +1,15 @@
-use camino::Utf8PathBuf;
+use std::collections::HashMap;
+
+use const_format::formatcp;

 #[cfg(test)]
 mod tests;

-use const_format::formatcp;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

-use postgres_backend::AuthType;
-use remote_storage::RemoteStorageConfig;
-use serde_with::serde_as;
-use std::{
-    collections::HashMap,
-    num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
-    time::Duration,
-};
-use utils::logging::LogFormat;
-
-use crate::models::ImageCompressionAlgorithm;
-use crate::models::LsnLease;
-
 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
 // as a separate structure.  This information is not neeed by the pageserver
 // itself, it is only used for registering the pageserver with the control
@@ -42,476 +29,3 @@ pub struct NodeMetadata {
    #[serde(flatten)]
    pub other: HashMap<String, serde_json::Value>,
 }
-
-/// `pageserver.toml`
-///
-/// We use serde derive with `#[serde(default)]` to generate a deserializer
-/// that fills in the default values for each config field.
-///
-/// If there cannot be a static default value because we need to make runtime
-/// checks to determine the default, make it an `Option` (which defaults to None).
-/// The runtime check should be done in the consuming crate, i.e., `pageserver`.
-#[serde_as]
-#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
-#[serde(default, deny_unknown_fields)]
-pub struct ConfigToml {
-    // types mapped 1:1 into the runtime PageServerConfig type
-    pub listen_pg_addr: String,
-    pub listen_http_addr: String,
-    pub availability_zone: Option<String>,
-    #[serde(with = "humantime_serde")]
-    pub wait_lsn_timeout: Duration,
-    #[serde(with = "humantime_serde")]
-    pub wal_redo_timeout: Duration,
-    pub superuser: String,
-    pub page_cache_size: usize,
-    pub max_file_descriptors: usize,
-    pub pg_distrib_dir: Option<Utf8PathBuf>,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    pub http_auth_type: AuthType,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    pub pg_auth_type: AuthType,
-    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
-    pub remote_storage: Option<RemoteStorageConfig>,
-    pub tenant_config: TenantConfigToml,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    pub broker_endpoint: storage_broker::Uri,
-    #[serde(with = "humantime_serde")]
-    pub broker_keepalive_interval: Duration,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    pub log_format: LogFormat,
-    pub concurrent_tenant_warmup: NonZeroUsize,
-    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
-    #[serde(with = "humantime_serde")]
-    pub metric_collection_interval: Duration,
-    pub metric_collection_endpoint: Option<reqwest::Url>,
-    pub metric_collection_bucket: Option<RemoteStorageConfig>,
-    #[serde(with = "humantime_serde")]
-    pub synthetic_size_calculation_interval: Duration,
-    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
-    pub test_remote_failures: u64,
-    pub ondemand_download_behavior_treat_error_as_warn: bool,
-    #[serde(with = "humantime_serde")]
-    pub background_task_maximum_delay: Duration,
-    pub control_plane_api: Option<reqwest::Url>,
-    pub control_plane_api_token: Option<String>,
-    pub control_plane_emergency_mode: bool,
-    pub heatmap_upload_concurrency: usize,
-    pub secondary_download_concurrency: usize,
-    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
-    pub ingest_batch_size: u64,
-    pub max_vectored_read_bytes: MaxVectoredReadBytes,
-    pub image_compression: ImageCompressionAlgorithm,
-    pub ephemeral_bytes_per_memory_kb: usize,
-    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    #[serde(skip_serializing)]
-    // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
-    pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
-    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
-    pub io_buffer_alignment: usize,
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields)]
-pub struct DiskUsageEvictionTaskConfig {
-    pub max_usage_pct: utils::serde_percent::Percent,
-    pub min_avail_bytes: u64,
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[cfg(feature = "testing")]
-    pub mock_statvfs: Option<statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
-pub mod statvfs {
-    pub mod mock {
-        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-        #[serde(tag = "type")]
-        pub enum Behavior {
-            Success {
-                blocksize: u64,
-                total_blocks: u64,
-                name_filter: Option<utils::serde_regex::Regex>,
-            },
-            #[cfg(feature = "testing")]
-            Failure { mocked_error: MockedError },
-        }
-
-        #[cfg(feature = "testing")]
-        #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-        #[allow(clippy::upper_case_acronyms)]
-        pub enum MockedError {
-            EIO,
-        }
-
-        #[cfg(feature = "testing")]
-        impl From<MockedError> for nix::Error {
-            fn from(e: MockedError) -> Self {
-                match e {
-                    MockedError::EIO => nix::Error::EIO,
-                }
-            }
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(tag = "type", content = "args")]
-pub enum EvictionOrder {
-    RelativeAccessed {
-        highest_layer_count_loses_first: bool,
-    },
-}
-
-impl Default for EvictionOrder {
-    fn default() -> Self {
-        Self::RelativeAccessed {
-            highest_layer_count_loses_first: true,
-        }
-    }
-}
-
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetVectoredImpl {
-    Sequential,
-    Vectored,
-}
-
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetImpl {
-    Legacy,
-    Vectored,
-}
-
-#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(transparent)]
-pub struct MaxVectoredReadBytes(pub NonZeroUsize);
-
-/// A tenant's calcuated configuration, which is the result of merging a
-/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
-///
-/// For storing and transmitting individual tenant's configuration, see
-/// TenantConfOpt.
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields, default)]
-pub struct TenantConfigToml {
-    // Flush out an inmemory layer, if it's holding WAL older than this
-    // This puts a backstop on how much WAL needs to be re-digested if the
-    // page server crashes.
-    // This parameter actually determines L0 layer file size.
-    pub checkpoint_distance: u64,
-    // Inmemory layer is also flushed at least once in checkpoint_timeout to
-    // eventually upload WAL after activity is stopped.
-    #[serde(with = "humantime_serde")]
-    pub checkpoint_timeout: Duration,
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub compaction_target_size: u64,
-    // How often to check if there's compaction work to be done.
-    // Duration::ZERO means automatic compaction is disabled.
-    #[serde(with = "humantime_serde")]
-    pub compaction_period: Duration,
-    // Level0 delta layer threshold for compaction.
-    pub compaction_threshold: usize,
-    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is #of bytes of WAL.
-    // Page versions older than this are garbage collected away.
-    pub gc_horizon: u64,
-    // Interval at which garbage collection is triggered.
-    // Duration::ZERO means automatic GC is disabled
-    #[serde(with = "humantime_serde")]
-    pub gc_period: Duration,
-    // Delta layer churn threshold to create L1 image layers.
-    pub image_creation_threshold: usize,
-    // Determines how much history is retained, to allow
-    // branching and read replicas at an older point in time.
-    // The unit is time.
-    // Page versions older than this are garbage collected away.
-    #[serde(with = "humantime_serde")]
-    pub pitr_interval: Duration,
-    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
-    #[serde(with = "humantime_serde")]
-    pub walreceiver_connect_timeout: Duration,
-    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
-    /// A stalled safekeeper will be changed to a newer one when it appears.
-    #[serde(with = "humantime_serde")]
-    pub lagging_wal_timeout: Duration,
-    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
-    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
-    /// to avoid eager reconnects.
-    pub max_lsn_wal_lag: NonZeroU64,
-    pub eviction_policy: crate::models::EvictionPolicy,
-    pub min_resident_size_override: Option<u64>,
-    // See the corresponding metric's help string.
-    #[serde(with = "humantime_serde")]
-    pub evictions_low_residence_duration_metric_threshold: Duration,
-
-    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
-    /// may be disabled if a Tenant will not have secondary locations: only secondary
-    /// locations will use the heatmap uploaded by attached locations.
-    #[serde(with = "humantime_serde")]
-    pub heatmap_period: Duration,
-
-    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
-    pub lazy_slru_download: bool,
-
-    pub timeline_get_throttle: crate::models::ThrottleConfig,
-
-    // How much WAL must be ingested before checking again whether a new image layer is required.
-    // Expresed in multiples of checkpoint distance.
-    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
-    pub switch_aux_file_policy: crate::models::AuxFilePolicy,
-
-    /// The length for an explicit LSN lease request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length: Duration,
-
-    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
-    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
-    #[serde(with = "humantime_serde")]
-    pub lsn_lease_length_for_ts: Duration,
-}
-
-pub mod defaults {
-    use crate::models::ImageCompressionAlgorithm;
-
-    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
-
-    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
-    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
-
-    pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
-
-    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
-    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
-
-    pub const DEFAULT_LOG_FORMAT: &str = "plain";
-
-    pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8;
-
-    pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;
-
-    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
-    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
-
-    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
-    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-
-    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
-
-    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
-        ImageCompressionAlgorithm::Zstd { level: Some(1) };
-
-    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
-
-    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
-
-    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
-}
-
-impl Default for ConfigToml {
-    fn default() -> Self {
-        use defaults::*;
-
-        Self {
-            listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
-            listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
-            availability_zone: (None),
-            wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
-                .expect("cannot parse default wait lsn timeout")),
-            wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
-                .expect("cannot parse default wal redo timeout")),
-            superuser: (DEFAULT_SUPERUSER.to_string()),
-            page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
-            max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
-            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
-            http_auth_type: (AuthType::Trust),
-            pg_auth_type: (AuthType::Trust),
-            auth_validation_public_key_path: (None),
-            remote_storage: None,
-            broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
-                .parse()
-                .expect("failed to parse default broker endpoint")),
-            broker_keepalive_interval: (humantime::parse_duration(
-                storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
-            )
-            .expect("cannot parse default keepalive interval")),
-            log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
-
-            concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP)
-                .expect("Invalid default constant")),
-            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(),
-            metric_collection_interval: (humantime::parse_duration(
-                DEFAULT_METRIC_COLLECTION_INTERVAL,
-            )
-            .expect("cannot parse default metric collection interval")),
-            synthetic_size_calculation_interval: (humantime::parse_duration(
-                DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
-            )
-            .expect("cannot parse default synthetic size calculation interval")),
-            metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT),
-
-            metric_collection_bucket: (None),
-
-            disk_usage_based_eviction: (None),
-
-            test_remote_failures: (0),
-
-            ondemand_download_behavior_treat_error_as_warn: (false),
-
-            background_task_maximum_delay: (humantime::parse_duration(
-                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
-            )
-            .unwrap()),
-
-            control_plane_api: (None),
-            control_plane_api_token: (None),
-            control_plane_emergency_mode: (false),
-
-            heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-            secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
-
-            ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE),
-
-            virtual_file_io_engine: None,
-
-            max_vectored_read_bytes: (MaxVectoredReadBytes(
-                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
-            )),
-            image_compression: (DEFAULT_IMAGE_COMPRESSION),
-            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
-            l0_flush: None,
-            compact_level0_phase1_value_access: Default::default(),
-            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
-
-            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
-
-            tenant_config: TenantConfigToml::default(),
-        }
-    }
-}
-
-pub mod tenant_conf_defaults {
-
-    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
-    // would be more appropriate. But a low value forces the code to be exercised more,
-    // which is good for now to trigger bugs.
-    // This parameter actually determines L0 layer file size.
-    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
-
-    // FIXME the below configs are only used by legacy algorithm. The new algorithm
-    // has different parameters.
-
-    // Target file size, when creating image and delta layers.
-    // This parameter determines L1 layer file size.
-    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
-
-    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
-    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
-        crate::models::CompactionAlgorithm::Legacy;
-
-    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-
-    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
-    // If there's a need to decrease this value, first make sure that GC
-    // doesn't hold a layer map write lock for non-trivial operations.
-    // Relevant: https://github.com/neondatabase/neon/issues/3394
-    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
-    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
-    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
-    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
-    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
-    // throughputs up to 1GiB/s per timeline.
-    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
-    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-    // By default ingest enough WAL for two new L0 layers before checking if new image
-    // image layers should be created.
-    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-}
-
-impl Default for TenantConfigToml {
-    fn default() -> Self {
-        use tenant_conf_defaults::*;
-        Self {
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
-                .expect("cannot parse default checkpoint timeout"),
-            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
-            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
-                .expect("cannot parse default compaction period"),
-            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
-            },
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
-                .expect("cannot parse default gc period"),
-            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
-            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
-                .expect("cannot parse default PITR interval"),
-            walreceiver_connect_timeout: humantime::parse_duration(
-                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
-            )
-            .expect("cannot parse default walreceiver connect timeout"),
-            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
-                .expect("cannot parse default walreceiver lagging wal timeout"),
-            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
-                .expect("cannot parse default max walreceiver Lsn wal lag"),
-            eviction_policy: crate::models::EvictionPolicy::NoEviction,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
-                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
-            )
-            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            heatmap_period: Duration::ZERO,
-            lazy_slru_download: false,
-            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
-            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
-            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
-            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
-        }
-    }
-}
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::time::{Duration, Instant};

@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId};

-use crate::models::PageserverUtilization;
 use crate::{
    models::{ShardParameters, TenantConfig},
    shard::{ShardStripeSize, TenantShardId},
@@ -56,8 +55,6 @@ pub struct NodeRegisterRequest {

    pub listen_http_addr: String,
    pub listen_http_port: u16,
-
-    pub availability_zone_id: String,
 }

 #[derive(Serialize, Deserialize)]
@@ -74,17 +71,6 @@ pub struct TenantPolicyRequest {
    pub scheduling: Option<ShardSchedulingPolicy>,
 }

-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsRequest {
-    #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, String>,
-}
-
-#[derive(Serialize, Deserialize)]
-pub struct ShardsPreferredAzsResponse {
-    pub updated: Vec<TenantShardId>,
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -112,21 +98,6 @@ pub struct TenantDescribeResponse {
    pub config: TenantConfig,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShardResponse {
-    pub node_id: NodeId,
-    pub shards: Vec<NodeShard>,
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-pub struct NodeShard {
-    pub tenant_shard_id: TenantShardId,
-    /// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_observed_secondary: Option<bool>,
-    /// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
-    pub is_intended_secondary: Option<bool>,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
    pub id: NodeId,
@@ -158,12 +129,8 @@ pub struct TenantDescribeResponseShard {
    pub is_splitting: bool,

    pub scheduling_policy: ShardSchedulingPolicy,
-
-    pub preferred_az_id: Option<String>,
 }

-/// Migration request for a given tenant shard to a given node.
-///
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -173,11 +140,23 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }

-#[derive(Serialize, Clone, Debug)]
+/// Utilisation score indicating how good a candidate a pageserver
+/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
+/// Lower values are better.
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
+pub struct UtilizationScore(pub u64);
+
+impl UtilizationScore {
+    pub fn worst() -> Self {
+        UtilizationScore(u64::MAX)
+    }
+}
+
+#[derive(Serialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active(PageserverUtilization),
+    Active(UtilizationScore),
    // Node is warming up, but we expect it to become available soon. Covers
    // the time span between the re-attach response being composed on the storage controller
    // and the first successful heartbeat after the processing of the re-attach response
@@ -216,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
        match val {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
-            NodeAvailabilityWrapper::Active => {
-                NodeAvailability::Active(PageserverUtilization::full())
-            }
+            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
            NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -108,41 +108,14 @@ impl Key {
        }
    }

-    /// This function checks more extensively what keys we can take on the write path.
-    /// If a key beginning with 00 does not have a global/default tablespace OID, it
-    /// will be rejected on the write path.
-    #[allow(dead_code)]
-    pub fn is_valid_key_on_write_path_strong(&self) -> bool {
-        use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
-        if !self.is_i128_representable() {
-            return false;
-        }
-        if self.field1 == 0
-            && !(self.field2 == GLOBALTABLESPACE_OID
-                || self.field2 == DEFAULTTABLESPACE_OID
-                || self.field2 == 0)
-        {
-            return false; // User defined tablespaces are not supported
-        }
-        true
-    }
-
-    /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
-    /// checks if the key is i128 representable. Note that some keys can be successfully
-    /// ingested into the pageserver, but will cause errors on generating basebackup.
-    pub fn is_valid_key_on_write_path(&self) -> bool {
-        self.is_i128_representable()
-    }
-
-    pub fn is_i128_representable(&self) -> bool {
-        self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
-    }
-
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
-        assert!(self.is_i128_representable(), "invalid key: {self}");
+        assert!(
+            self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
+            "invalid key: {self}",
+        );
        (((self.field1 & 0x7F) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,9 +6,8 @@ pub use utilization::PageserverUtilization;

 use std::{
    collections::HashMap,
-    fmt::Display,
    io::{BufRead, Read},
-    num::{NonZeroU32, NonZeroU64, NonZeroUsize},
+    num::{NonZeroU64, NonZeroUsize},
    str::FromStr,
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
@@ -62,7 +61,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::VariantNames,
+    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
@@ -305,10 +304,8 @@ pub struct TenantConfig {
    pub lsn_lease_length_for_ts: Option<String>,
 }

-/// The policy for the aux file storage.
-///
-/// It can be switched through `switch_aux_file_policy` tenant config.
-/// When the first aux file written, the policy will be persisted in the
+/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
+/// tenant config. When the first aux file written, the policy will be persisted in the
 /// `index_part.json` file and has a limited migration path.
 ///
 /// Currently, we only allow the following migration path:
@@ -438,9 +435,7 @@ pub enum CompactionAlgorithm {
    Tiered,
 }

-#[derive(
-    Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay,
-)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum ImageCompressionAlgorithm {
    // Disabled for writes, support decompressing during read path
    Disabled,
@@ -475,33 +470,11 @@ impl FromStr for ImageCompressionAlgorithm {
    }
 }

-impl Display for ImageCompressionAlgorithm {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
-            ImageCompressionAlgorithm::Zstd { level } => {
-                if let Some(level) = level {
-                    write!(f, "zstd({})", level)
-                } else {
-                    write!(f, "zstd")
-                }
-            }
-        }
-    }
-}
-
 #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
 pub struct CompactionAlgorithmSettings {
    pub kind: CompactionAlgorithm,
 }

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum L0FlushConfig {
-    #[serde(rename_all = "snake_case")]
-    Direct { max_concurrency: NonZeroUsize },
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -513,11 +486,12 @@ pub struct EvictionPolicyLayerAccessThreshold {
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ThrottleConfig {
    pub task_kinds: Vec<String>, // TaskKind
-    pub initial: u32,
+    pub initial: usize,
    #[serde(with = "humantime_serde")]
    pub refill_interval: Duration,
-    pub refill_amount: NonZeroU32,
-    pub max: u32,
+    pub refill_amount: NonZeroUsize,
+    pub max: usize,
+    pub fair: bool,
 }

 impl ThrottleConfig {
@@ -527,8 +501,9 @@ impl ThrottleConfig {
            // other values don't matter with emtpy `task_kinds`.
            initial: 0,
            refill_interval: Duration::from_millis(1),
-            refill_amount: NonZeroU32::new(1).unwrap(),
+            refill_amount: NonZeroUsize::new(1).unwrap(),
            max: 1,
+            fair: true,
        }
    }
    /// The requests per second allowed  by the given config.
@@ -746,14 +721,8 @@ pub struct TimelineInfo {

    pub walreceiver_status: String,

-    // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility.
-    // Backward compatibility: you will get a JSON not containing the newly-added field.
-    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
-    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
-    // read.
    /// The last aux file policy being used on this timeline
    pub last_aux_file_policy: Option<AuxFilePolicy>,
-    pub is_archived: Option<bool>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -898,9 +867,7 @@ pub struct WalRedoManagerStatus {
    pub process: Option<WalRedoManagerProcessStatus>,
 }

-/// The progress of a secondary tenant.
-///
-/// It is mostly useful when doing a long running download: e.g. initiating
+/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
@@ -1095,7 +1062,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
    }
 }

-// A GetPage request contains two LSN values:
+// In the V2 protocol version, a GetPage request contains two LSN values:
 //
 // request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
 // "get the latest version present". It's used by the primary server, which knows that no one else
@@ -1108,7 +1075,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
 // passing an earlier LSN can speed up the request, by allowing the pageserver to process the
 // request without waiting for 'request_lsn' to arrive.
 //
-// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
+// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
 // sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
 // 'latest' was set to true. The V2 interface was added because there was no correct way for a
 // standby to request a page at a particular non-latest LSN, and also include the
@@ -1116,11 +1083,15 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
 // request, if the standby knows that the page hasn't been modified since, and risk getting an error
 // if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
 // require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
+// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
 // difference in the responses between V1 and V2.
 //
+// The Request structs below reflect the V2 interface. If V1 is used, the parse function
+// maps the old format requests to the new format.
+//
 #[derive(Clone, Copy)]
 pub enum PagestreamProtocolVersion {
+    V1,
    V2,
 }

@@ -1259,17 +1230,36 @@ impl PagestreamFeMessage {
        bytes.into()
    }

-    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(
+        body: &mut R,
+        protocol_version: PagestreamProtocolVersion,
+    ) -> anyhow::Result<PagestreamFeMessage> {
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;

-        // these two fields are the same for every request type
-        let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
-        let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
+        let (request_lsn, not_modified_since) = match protocol_version {
+            PagestreamProtocolVersion::V2 => (
+                Lsn::from(body.read_u64::<BigEndian>()?),
+                Lsn::from(body.read_u64::<BigEndian>()?),
+            ),
+            PagestreamProtocolVersion::V1 => {
+                // In the old protocol, each message starts with a boolean 'latest' flag,
+                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
+                // 'not_modified_since', used in the new protocol version.
+                let latest = body.read_u8()? != 0;
+                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
+                if latest {
+                    (Lsn::MAX, request_lsn) // get latest version
+                } else {
+                    (request_lsn, request_lsn) // get version at specified LSN
+                }
+            }
+        };

+        // The rest of the messages are the same between V1 and V2
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
                request_lsn,
@@ -1477,7 +1467,9 @@ mod tests {
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
+            let reconstructed =
+                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
+                    .unwrap();
            assert!(msg == reconstructed);
        }
    }
@@ -1685,33 +1677,21 @@ mod tests {
    #[test]
    fn test_image_compression_algorithm_parsing() {
        use ImageCompressionAlgorithm::*;
-        let cases = [
-            ("disabled", Disabled),
-            ("zstd", Zstd { level: None }),
-            ("zstd(18)", Zstd { level: Some(18) }),
-            ("zstd(-3)", Zstd { level: Some(-3) }),
-        ];
-
-        for (display, expected) in cases {
-            assert_eq!(
-                ImageCompressionAlgorithm::from_str(display).unwrap(),
-                expected,
-                "parsing works"
-            );
-            assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip");
-
-            let ser = serde_json::to_string(&expected).expect("serialization");
-            assert_eq!(
-                serde_json::from_str::<ImageCompressionAlgorithm>(&ser).unwrap(),
-                expected,
-                "serde roundtrip"
-            );
-
-            assert_eq!(
-                serde_json::Value::String(display.to_string()),
-                serde_json::to_value(expected).unwrap(),
-                "Display is the serde serialization"
-            );
-        }
+        assert_eq!(
+            ImageCompressionAlgorithm::from_str("disabled").unwrap(),
+            Disabled
+        );
+        assert_eq!(
+            ImageCompressionAlgorithm::from_str("zstd").unwrap(),
+            Zstd { level: None }
+        );
+        assert_eq!(
+            ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(),
+            Zstd { level: Some(18) }
+        );
+        assert_eq!(
+            ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(),
+            Zstd { level: Some(-3) }
+        );
    }
 }
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
    pub max_shard_count: u32,

    /// Cached result of [`Self::score`]
-    pub utilization_score: Option<u64>,
+    pub utilization_score: u64,

    /// When was this snapshot captured, pageserver local time.
    ///
@@ -50,8 +50,6 @@ fn unity_percent() -> Percent {
    Percent::new(0).unwrap()
 }

-pub type RawScore = u64;
-
 impl PageserverUtilization {
    const UTILIZATION_FULL: u64 = 1000000;

@@ -64,7 +62,7 @@ impl PageserverUtilization {
    /// - Negative values are forbidden
    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
    ///   layer eviction.
-    pub fn score(&self) -> RawScore {
+    pub fn score(&self) -> u64 {
        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
            * self.disk_usable_pct.get() as u64)
            / 100;
@@ -76,41 +74,8 @@ impl PageserverUtilization {
        std::cmp::max(disk_utilization_score, shard_utilization_score)
    }

-    pub fn cached_score(&mut self) -> RawScore {
-        match self.utilization_score {
-            None => {
-                let s = self.score();
-                self.utilization_score = Some(s);
-                s
-            }
-            Some(s) => s,
-        }
-    }
-
-    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
-    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
-    ///
-    /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
-    /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
-    pub fn is_overloaded(score: RawScore) -> bool {
-        // Why the factor of two?  This is unscientific but reflects behavior of real systems:
-        // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
-        //   startup and housekeeping jobs nice and responsive.  We can go to double this limit if needed
-        //   until some more nodes are deployed.
-        // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
-        //   hold its biggest timeline fully on disk, which is tends to be an over estimate when
-        //   some tenants are very idle and have dropped layers from disk.  In practice going up to
-        //   double is generally better than giving up and scheduling in a sub-optimal AZ.
-        score >= 2 * Self::UTILIZATION_FULL
-    }
-
-    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
-        if self.shard_count < shard_count {
-            self.shard_count = shard_count;
-
-            // Dirty cache: this will be calculated next time someone retrives the score
-            self.utilization_score = None;
-        }
+    pub fn refresh_score(&mut self) {
+        self.utilization_score = self.score();
    }

    /// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -123,38 +88,7 @@ impl PageserverUtilization {
            disk_usable_pct: Percent::new(100).unwrap(),
            shard_count: 1,
            max_shard_count: 1,
-            utilization_score: Some(Self::UTILIZATION_FULL),
-            captured_at: serde_system_time::SystemTime(SystemTime::now()),
-        }
-    }
-}
-
-/// Test helper
-pub mod test_utilization {
-    use super::PageserverUtilization;
-    use std::time::SystemTime;
-    use utils::{
-        serde_percent::Percent,
-        serde_system_time::{self},
-    };
-
-    // Parameters of the imaginary node used for test utilization instances
-    const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
-    const TEST_SHARDS_MAX: u32 = 1000;
-
-    /// Unit test helper.  Unconditionally compiled because cfg(test) doesn't carry across crates.  Do
-    /// not abuse this function from non-test code.
-    ///
-    /// Emulates a node with a 1000 shard limit and a 1TB disk.
-    pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
-        PageserverUtilization {
-            disk_usage_bytes: disk_wanted_bytes,
-            free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
-            disk_wanted_bytes,
-            disk_usable_pct: Percent::new(100).unwrap(),
-            shard_count,
-            max_shard_count: TEST_SHARDS_MAX,
-            utilization_score: None,
+            utilization_score: Self::UTILIZATION_FULL,
            captured_at: serde_system_time::SystemTime(SystemTime::now()),
        }
    }
@@ -186,7 +120,7 @@ mod tests {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            disk_wanted_bytes: u64::MAX,
-            utilization_score: Some(13),
+            utilization_score: 13,
            disk_usable_pct: Percent::new(90).unwrap(),
            shard_count: 100,
            max_shard_count: 200,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -69,10 +69,8 @@ impl QueryError {
 }

 /// Returns true if the given error is a normal consequence of a network issue,
-/// or the client closing the connection.
-///
-/// These errors can happen during normal operations,
-/// and don't indicate a bug in our code.
+/// or the client closing the connection. These errors can happen during normal
+/// operations, and don't indicate a bug in our code.
 pub fn is_expected_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    matches!(
@@ -81,16 +79,17 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
    )
 }

+#[async_trait::async_trait]
 pub trait Handler<IO> {
    /// Handle single query.
    /// postgres_backend will issue ReadyForQuery after calling this (this
    /// might be not what we want after CopyData streaming, but currently we don't
    /// care). It will also flush out the output buffer.
-    fn process_query(
+    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
-    ) -> impl Future<Output = Result<(), QueryError>>;
+    ) -> Result<(), QueryError>;

    /// Called on startup packet receival, allows to process params.
    ///
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -23,6 +23,7 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {

 struct TestHandler {}

+#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
    // return single col 'hey' for any query
    async fn process_query(
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -7,7 +7,6 @@ use std::fmt;
 use url::Host;

 /// Parses a string of format either `host:port` or `host` into a corresponding pair.
-///
 /// The `host` part should be a correct `url::Host`, while `port` (if present) should be
 /// a valid decimal u16 of digits only.
 pub fn parse_host_port<S: AsRef<str>>(host_port: S) -> Result<(Host, Option<u16>), anyhow::Error> {
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -14,7 +14,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
    fn include_file(&self, filename: &str) {
        // This does the equivalent of passing bindgen::CargoCallbacks
        // to the builder .parse_callbacks() method.
-        let cargo_callbacks = bindgen::CargoCallbacks::new();
+        let cargo_callbacks = bindgen::CargoCallbacks;
        cargo_callbacks.include_file(filename)
    }

@@ -121,7 +121,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("XLogPageHeaderData")
            .allowlist_type("XLogLongPageHeaderData")
            .allowlist_var("XLOG_PAGE_MAGIC")
-            .allowlist_var("PG_MAJORVERSION_NUM")
            .allowlist_var("PG_CONTROL_FILE_SIZE")
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -44,9 +44,6 @@ macro_rules! postgres_ffi {
            // Re-export some symbols from bindings
            pub use bindings::DBState_DB_SHUTDOWNED;
            pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
-
-            pub const ZERO_CHECKPOINT: bytes::Bytes =
-                bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
        }
    };
 }
@@ -109,107 +106,6 @@ macro_rules! dispatch_pgversion {
    };
 }

-#[macro_export]
-macro_rules! enum_pgversion_dispatch {
-    ($name:expr, $typ:ident, $bind:ident, $code:block) => {
-        enum_pgversion_dispatch!(
-            name = $name,
-            bind = $bind,
-            typ = $typ,
-            code = $code,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-            ]
-        )
-    };
-    (name = $name:expr,
-     bind = $bind:ident,
-     typ = $typ:ident,
-     code = $code:block,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
-        match $name {
-            $(
-            self::$typ::$variant($bind) => {
-                use $crate::$md as pgv;
-                $code
-            }
-            ),+,
-        }
-    };
-}
-
-#[macro_export]
-macro_rules! enum_pgversion {
-    {$name:ident, pgv :: $t:ident} => {
-        enum_pgversion!{
-            name = $name,
-            typ = $t,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-            ]
-        }
-    };
-    {$name:ident, pgv :: $p:ident :: $t:ident} => {
-        enum_pgversion!{
-            name = $name,
-            path = $p,
-            typ = $t,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-            ]
-        }
-    };
-    {name = $name:ident,
-     typ = $t:ident,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
-        pub enum $name {
-            $($variant ( $crate::$md::$t )),+
-        }
-        impl self::$name {
-            pub fn pg_version(&self) -> u32 {
-                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
-                })
-            }
-        }
-        $(
-        impl Into<self::$name> for $crate::$md::$t {
-            fn into(self) -> self::$name {
-                self::$name::$variant (self)
-            }
-        }
-        )+
-    };
-    {name = $name:ident,
-     path = $p:ident,
-     typ = $t:ident,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
-        pub enum $name {
-            $($variant ($crate::$md::$p::$t)),+
-        }
-        impl $name {
-            pub fn pg_version(&self) -> u32 {
-                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
-                })
-            }
-        }
-        $(
-        impl Into<$name> for $crate::$md::$p::$t {
-            fn into(self) -> $name {
-                $name::$variant (self)
-            }
-        }
-        )+
-    };
-}
-
 pub mod pg_constants;
 pub mod relfile_utils;

@@ -240,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
+pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
-pub use v14::xlog_utils::try_from_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;

 pub use v14::bindings::DBState_DB_SHUTDOWNED;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -135,8 +135,6 @@ pub fn get_current_timestamp() -> TimestampTz {
 mod timestamp_conversions {
    use std::time::Duration;

-    use anyhow::Context;
-
    use super::*;

    const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
@@ -156,18 +154,18 @@ mod timestamp_conversions {
        }
    }

-    pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
+    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
        let time: u64 = time
            .try_into()
-            .context("timestamp before millenium (postgres epoch)")?;
+            .expect("timestamp before millenium (postgres epoch)");
        let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
        SystemTime::UNIX_EPOCH
            .checked_add(Duration::from_micros(since_unix_epoch))
-            .context("SystemTime overflow")
+            .expect("SystemTime overflow")
    }
 }

-pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
+pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};

 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
@@ -547,14 +545,14 @@ mod tests {
    #[test]
    fn test_ts_conversion() {
        let now = SystemTime::now();
-        let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
+        let round_trip = from_pg_timestamp(to_pg_timestamp(now));

        let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
        let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
        assert_eq!(now_since.as_micros(), round_trip_since.as_micros());

        let now_pg = get_current_timestamp();
-        let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
+        let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));

        assert_eq!(now_pg, round_trip_pg);
    }
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -185,7 +185,7 @@ mod tests {
    use super::*;

    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
-        let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
+        let toml = input.parse::<toml_edit::Document>().unwrap();
        RemoteStorageConfig::from_toml(toml.as_item())
    }

@@ -235,31 +235,6 @@ timeout = '5s'";
        );
    }

-    #[test]
-    fn test_storage_class_serde_roundtrip() {
-        let classes = [
-            None,
-            Some(StorageClass::Standard),
-            Some(StorageClass::IntelligentTiering),
-        ];
-        for class in classes {
-            #[derive(Serialize, Deserialize)]
-            struct Wrapper {
-                #[serde(
-                    deserialize_with = "deserialize_storage_class",
-                    serialize_with = "serialize_storage_class"
-                )]
-                class: Option<StorageClass>,
-            }
-            let wrapped = Wrapper {
-                class: class.clone(),
-            };
-            let serialized = serde_json::to_string(&wrapped).unwrap();
-            let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap();
-            assert_eq!(class, deserialized.class);
-        }
-    }
-
    #[test]
    fn test_azure_parsing() {
        let toml = "\
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -45,8 +45,6 @@ pub use azure_core::Etag;

 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};

-/// Default concurrency limit for S3 operations
-///
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -302,9 +300,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<(), TimeTravelError>;
 }

-/// Data part of an ongoing [`Download`].
-///
-/// `DownloadStream` is sensitive to the timeout and cancellation used with the original
+/// DownloadStream is sensitive to the timeout and cancellation used with the original
 /// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
 /// with `tokio::io::copy_buf`.
 // This has 'static because safekeepers do not use cancellation tokens (yet)
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -60,16 +60,3 @@ pub struct TimelineCopyRequest {
    pub target_timeline_id: TimelineId,
    pub until_lsn: Lsn,
 }
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpRequest {
-    /// bump to
-    pub term: Option<u64>,
-}
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TimelineTermBumpResponse {
-    // before the request
-    pub previous_term: u64,
-    pub current_term: u64,
-}
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -5,10 +5,9 @@
 mod calculation;
 pub mod svg;

-/// StorageModel is the input to the synthetic size calculation.
-///
-/// It represents a tree of timelines, with just the information that's needed
-/// for the calculation. This doesn't track timeline names or where each timeline
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
 /// begins and ends, for example. Instead, it consists of "points of interest"
 /// on the timelines. A point of interest could be the timeline start or end point,
 /// the oldest point on a timeline that needs to be retained because of PITR
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -14,6 +14,7 @@ testing = ["fail/failpoints"]
 arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
+async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
@@ -25,6 +26,7 @@ hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
+leaky-bucket.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -5,10 +5,8 @@ use std::{

 use metrics::IntCounter;

-/// Circuit breakers are for operations that are expensive and fallible.
-///
-/// If a circuit breaker fails repeatedly, we will stop attempting it for some
-/// period of time, to avoid denial-of-service from retries, and
+/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
+/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
 /// to mitigate the log spam from repeated failures.
 pub struct CircuitBreaker {
    /// An identifier that enables us to log useful errors when a circuit is broken
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,4 +1,3 @@
-use std::os::fd::AsRawFd;
 use std::{
    borrow::Cow,
    fs::{self, File},
@@ -204,27 +203,6 @@ pub fn overwrite(
    Ok(())
 }

-/// Syncs the filesystem for the given file descriptor.
-#[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
-    // Linux guarantees durability for syncfs.
-    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
-    #[cfg(target_os = "linux")]
-    {
-        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
-    }
-    #[cfg(target_os = "macos")]
-    {
-        // macOS is not a production platform for Neon, don't even bother.
-    }
-    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
-    {
-        compile_error!("Unsupported OS");
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -249,10 +249,8 @@ macro_rules! id_newtype {
    };
 }

-/// Neon timeline ID.
-///
-/// They are different from PostgreSQL timeline
-/// IDs, but serve a similar purpose: they differentiate
+/// Neon timeline IDs are different from PostgreSQL timeline
+/// IDs. They serve a similar purpose though: they differentiate
 /// between different "histories" of the same cluster.  However,
 /// PostgreSQL timeline IDs are a bit cumbersome, because they are only
 /// 32-bits wide, and they must be in ascending order in any given
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -1,280 +0,0 @@
-//! This module implements the Generic Cell Rate Algorithm for a simplified
-//! version of the Leaky Bucket rate limiting system.
-//!
-//! # Leaky Bucket
-//!
-//! If the bucket is full, no new requests are allowed and are throttled/errored.
-//! If the bucket is partially full/empty, new requests are added to the bucket in
-//! terms of "tokens".
-//!
-//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
-//!
-//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
-//!
-//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
-//!
-//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
-//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
-//!
-//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
-//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
-//!
-//! Another explaination can be found here: <https://brandur.org/rate-limiting>
-
-use std::{sync::Mutex, time::Duration};
-
-use tokio::{sync::Notify, time::Instant};
-
-pub struct LeakyBucketConfig {
-    /// This is the "time cost" of a single request unit.
-    /// Should loosely represent how long it takes to handle a request unit in active resource time.
-    /// Loosely speaking this is the inverse of the steady-rate requests-per-second
-    pub cost: Duration,
-
-    /// total size of the bucket
-    pub bucket_width: Duration,
-}
-
-impl LeakyBucketConfig {
-    pub fn new(rps: f64, bucket_size: f64) -> Self {
-        let cost = Duration::from_secs_f64(rps.recip());
-        let bucket_width = cost.mul_f64(bucket_size);
-        Self { cost, bucket_width }
-    }
-}
-
-pub struct LeakyBucketState {
-    /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
-    ///
-    /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
-    /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
-    /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
-    /// Draining the bucket will happen naturally as `now` moves forward.
-    ///
-    /// Let `n` be some "time cost" for the request,
-    /// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
-    /// If now is within the `bucket window + n`, we are within time budget.
-    /// If now is before the `bucket window + n`, we have run out of budget.
-    ///
-    /// This is inspired by the generic cell rate algorithm (GCRA) and works
-    /// exactly the same as a leaky-bucket.
-    pub empty_at: Instant,
-}
-
-impl LeakyBucketState {
-    pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
-        LeakyBucketState {
-            empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
-        }
-    }
-
-    pub fn bucket_is_empty(&self, now: Instant) -> bool {
-        // if self.end is after now, the bucket is not empty
-        self.empty_at <= now
-    }
-
-    /// Immediately adds tokens to the bucket, if there is space.
-    ///
-    /// In a scenario where you are waiting for available rate,
-    /// rather than just erroring immediately, `started` corresponds to when this waiting started.
-    ///
-    /// `n` is the number of tokens that will be filled in the bucket.
-    ///
-    /// # Errors
-    ///
-    /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
-    /// there will be space again.
-    pub fn add_tokens(
-        &mut self,
-        config: &LeakyBucketConfig,
-        started: Instant,
-        n: f64,
-    ) -> Result<(), Instant> {
-        let now = Instant::now();
-
-        // invariant: started <= now
-        debug_assert!(started <= now);
-
-        // If the bucket was empty when we started our search,
-        // we should update the `empty_at` value accordingly.
-        // this prevents us from having negative tokens in the bucket.
-        let mut empty_at = self.empty_at;
-        if empty_at < started {
-            empty_at = started;
-        }
-
-        let n = config.cost.mul_f64(n);
-        let new_empty_at = empty_at + n;
-        let allow_at = new_empty_at.checked_sub(config.bucket_width);
-
-        //                     empty_at
-        //          allow_at    |   new_empty_at
-        //           /          |   /
-        // -------o-[---------o-|--]---------
-        //   now1 ^      now2 ^
-        //
-        // at now1, the bucket would be completely filled if we add n tokens.
-        // at now2, the bucket would be partially filled if we add n tokens.
-
-        match allow_at {
-            Some(allow_at) if now < allow_at => Err(allow_at),
-            _ => {
-                self.empty_at = new_empty_at;
-                Ok(())
-            }
-        }
-    }
-}
-
-pub struct RateLimiter {
-    pub config: LeakyBucketConfig,
-    pub state: Mutex<LeakyBucketState>,
-    /// a queue to provide this fair ordering.
-    pub queue: Notify,
-}
-
-struct Requeue<'a>(&'a Notify);
-
-impl Drop for Requeue<'_> {
-    fn drop(&mut self) {
-        self.0.notify_one();
-    }
-}
-
-impl RateLimiter {
-    pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
-        RateLimiter {
-            state: Mutex::new(LeakyBucketState::with_initial_tokens(
-                &config,
-                initial_tokens,
-            )),
-            config,
-            queue: {
-                let queue = Notify::new();
-                queue.notify_one();
-                queue
-            },
-        }
-    }
-
-    pub fn steady_rps(&self) -> f64 {
-        self.config.cost.as_secs_f64().recip()
-    }
-
-    /// returns true if we did throttle
-    pub async fn acquire(&self, count: usize) -> bool {
-        let mut throttled = false;
-
-        let start = tokio::time::Instant::now();
-
-        // wait until we are the first in the queue
-        let mut notified = std::pin::pin!(self.queue.notified());
-        if !notified.as_mut().enable() {
-            throttled = true;
-            notified.await;
-        }
-
-        // notify the next waiter in the queue when we are done.
-        let _guard = Requeue(&self.queue);
-
-        loop {
-            let res = self
-                .state
-                .lock()
-                .unwrap()
-                .add_tokens(&self.config, start, count as f64);
-            match res {
-                Ok(()) => return throttled,
-                Err(ready_at) => {
-                    throttled = true;
-                    tokio::time::sleep_until(ready_at).await;
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::time::Duration;
-
-    use tokio::time::Instant;
-
-    use super::{LeakyBucketConfig, LeakyBucketState};
-
-    #[tokio::test(start_paused = true)]
-    async fn check() {
-        let config = LeakyBucketConfig {
-            // average 100rps
-            cost: Duration::from_millis(10),
-            // burst up to 100 requests
-            bucket_width: Duration::from_millis(1000),
-        };
-
-        let mut state = LeakyBucketState {
-            empty_at: Instant::now(),
-        };
-
-        // supports burst
-        {
-            // should work for 100 requests this instant
-            for _ in 0..100 {
-                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
-            }
-            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
-            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
-        }
-
-        // doesn't overfill
-        {
-            // after 1s we should have an empty bucket again.
-            tokio::time::advance(Duration::from_secs(1)).await;
-            assert!(state.bucket_is_empty(Instant::now()));
-
-            // after 1s more, we should not over count the tokens and allow more than 200 requests.
-            tokio::time::advance(Duration::from_secs(1)).await;
-            for _ in 0..100 {
-                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
-            }
-            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
-            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
-        }
-
-        // supports sustained rate over a long period
-        {
-            tokio::time::advance(Duration::from_secs(1)).await;
-
-            // should sustain 100rps
-            for _ in 0..2000 {
-                tokio::time::advance(Duration::from_millis(10)).await;
-                state.add_tokens(&config, Instant::now(), 1.0).unwrap();
-            }
-        }
-
-        // supports requesting more tokens than can be stored in the bucket
-        // we just wait a little bit longer upfront.
-        {
-            // start the bucket completely empty
-            tokio::time::advance(Duration::from_secs(5)).await;
-            assert!(state.bucket_is_empty(Instant::now()));
-
-            // requesting 200 tokens of space should take 200*cost = 2s
-            // but we already have 1s available, so we wait 1s from start.
-            let start = Instant::now();
-
-            let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
-            assert_eq!(ready - Instant::now(), Duration::from_secs(1));
-
-            tokio::time::advance(Duration::from_millis(500)).await;
-            let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
-            assert_eq!(ready - Instant::now(), Duration::from_millis(500));
-
-            tokio::time::advance(Duration::from_millis(500)).await;
-            state.add_tokens(&config, start, 200.0).unwrap();
-
-            // bucket should be completely full now
-            let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
-            assert_eq!(ready - Instant::now(), Duration::from_millis(10));
-        }
-    }
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -71,7 +71,6 @@ pub mod postgres_client;

 pub mod tracing_span_assert;

-pub mod leaky_bucket;
 pub mod rate_limit;

 /// Simple once-barrier and a guard which keeps barrier awaiting.
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -100,9 +100,7 @@ pub enum LockFileRead {
 }

 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
-/// inspect its content.
-///
-/// It is not an `Err(...)` if the file does not exist or is already locked.
+/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -3,9 +3,9 @@ use std::str::FromStr;
 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use strum_macros::{EnumString, VariantNames};
+use strum_macros::{EnumString, EnumVariantNames};

-#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
    Plain,
@@ -188,7 +188,7 @@ impl Drop for TracingPanicHookGuard {
 }

 /// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicHookInfo) {
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
    // following rust 1.66.1 std implementation:
    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
    let location = info.location();
@@ -274,14 +274,6 @@ impl From<String> for SecretString {
    }
 }

-impl FromStr for SecretString {
-    type Err = std::convert::Infallible;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        Ok(Self(s.to_string()))
-    }
-}
-
 impl std::fmt::Debug for SecretString {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[SECRET]")
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -8,7 +8,6 @@ use tracing::{trace, warn};
 use crate::lsn::Lsn;

 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
-///
 /// Serialized in custom flexible key/value format. In replication protocol, it
 /// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
 /// Standby status update / Hot standby feedback messages.
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -65,8 +65,6 @@ impl<T> Poison<T> {
    }
 }

-/// Armed pointer to a [`Poison`].
-///
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -5,15 +5,6 @@ use std::time::{Duration, Instant};
 pub struct RateLimit {
    last: Option<Instant>,
    interval: Duration,
-    dropped: u64,
-}
-
-pub struct RateLimitStats(u64);
-
-impl std::fmt::Display for RateLimitStats {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(f, "{} dropped calls", self.0)
-    }
 }

 impl RateLimit {
@@ -21,27 +12,20 @@ impl RateLimit {
        Self {
            last: None,
            interval,
-            dropped: 0,
        }
    }

    /// Call `f` if the rate limit allows.
    /// Don't call it otherwise.
    pub fn call<F: FnOnce()>(&mut self, f: F) {
-        self.call2(|_| f())
-    }
-
-    pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
        let now = Instant::now();
        match self.last {
            Some(last) if now - last <= self.interval => {
                // ratelimit
-                self.dropped += 1;
            }
            _ => {
                self.last = Some(now);
-                f(RateLimitStats(self.dropped));
-                self.dropped = 0;
+                f();
            }
        }
    }
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -13,11 +13,10 @@ pub struct ShardNumber(pub u8);
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);

-/// Combination of ShardNumber and ShardCount.
-///
-/// For use within the context of a particular tenant, when we need to know which shard we're
-/// dealing with, but do not need to know the full ShardIdentity (because we won't be doing
-/// any page->shard mapping), and do not need to know the fully qualified TenantShardId.
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -49,11 +49,12 @@ use std::sync::{RwLock, RwLockWriteGuard};

 use tokio::sync::watch;

-/// Rcu allows multiple readers to read and hold onto a value without blocking
-/// (for very long).
 ///
-/// Storing to the Rcu updates the value, making new readers immediately see
-/// the new value, but it also waits for all current readers to finish.
+/// Rcu allows multiple readers to read and hold onto a value without blocking
+/// (for very long).  Storing to the Rcu updates the value, making new readers
+/// immediately see the new value, but it also waits for all current readers to
+/// finish.
+///
 pub struct Rcu<V> {
    inner: RwLock<RcuInner<V>>,
 }
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -5,9 +5,7 @@ use std::sync::{
 use tokio::sync::Semaphore;

 /// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`.
-///
-/// Allows use of `take` which does not require holding an outer mutex guard
+/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
 /// for the duration of initialization.
 ///
 /// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
--- a/libs/utils/src/toml_edit_ext.rs
+++ b/libs/utils/src/toml_edit_ext.rs
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
 where
    T: serde::de::DeserializeOwned,
 {
-    let document: toml_edit::DocumentMut = match item {
+    let document: toml_edit::Document = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -7,7 +7,6 @@ pub enum VecMapOrdering {
 }

 /// Ordered map datastructure implemented in a Vec.
-///
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -6,10 +6,9 @@ pub enum YieldingLoopError {
    Cancelled,
 }

-/// Helper for long synchronous loops, e.g. over all tenants in the system.
-///
-/// Periodically yields to avoid blocking the executor, and after resuming
-/// checks the provided cancellation token to drop out promptly on shutdown.
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
 #[inline(always)]
 pub async fn yielding_loop<I, T, F>(
    interval: usize,
@@ -24,7 +23,7 @@ where
    for (i, item) in iter.enumerate() {
        visitor(item);

-        if (i + 1) % interval == 0 {
+        if (i + 1) % interval == 0 {
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err(YieldingLoopError::Cancelled);
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -4,6 +4,7 @@
 use std::{env, path::PathBuf, process::Command};

 use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;

 fn main() -> anyhow::Result<()> {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
@@ -63,25 +64,16 @@ fn main() -> anyhow::Result<()> {
            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
    };

-    let unwind_abi_functions = [
-        "log_internal",
-        "recovery_download",
-        "start_streaming",
-        "finish_sync_safekeepers",
-        "wait_event_set",
-        "WalProposerStart",
-    ];
-
    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
    // the resulting bindings.
-    let mut builder = bindgen::Builder::default()
+    let bindings = bindgen::Builder::default()
        // The input header we would like to generate
        // bindings for.
        .header("bindgen_deps.h")
        // Tell cargo to invalidate the built crate whenever any of the
        // included header files changed.
-        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
+        .parse_callbacks(Box::new(CargoCallbacks))
        .allowlist_type("WalProposer")
        .allowlist_type("WalProposerConfig")
        .allowlist_type("walproposer_api")
@@ -113,12 +105,7 @@ fn main() -> anyhow::Result<()> {
        .allowlist_var("WL_SOCKET_MASK")
        .clang_arg("-DWALPROPOSER_LIB")
        .clang_arg(format!("-I{pgxn_neon}"))
-        .clang_arg(format!("-I{inc_server_path}"));
-
-    for name in unwind_abi_functions {
-        builder = builder.override_abi(bindgen::Abi::CUnwind, name);
-    }
-    let bindings = builder
+        .clang_arg(format!("-I{inc_server_path}"))
        // Finish the builder and generate the bindings.
        .generate()
        // Unwrap the Result and panic on failure.
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -33,7 +33,7 @@ extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemStat
    }
 }

-extern "C-unwind" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -187,7 +187,7 @@ extern "C" fn conn_blocking_write(
    }
 }

-extern "C-unwind" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -272,7 +272,7 @@ extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
    }
 }

-extern "C-unwind" fn wait_event_set(
+extern "C" fn wait_event_set(
    wp: *mut WalProposer,
    timeout: ::std::os::raw::c_long,
    event_sk: *mut *mut Safekeeper,
@@ -324,7 +324,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
    }
 }

-extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
@@ -340,7 +340,7 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekee
    }
 }

-extern "C-unwind" fn log_internal(
+extern "C" fn log_internal(
    wp: *mut WalProposer,
    level: ::std::os::raw::c_int,
    line: *const ::std::os::raw::c_char,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints", "pageserver_api/testing" ]
+testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
@@ -16,7 +16,6 @@ arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
-bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 camino.workspace = true
@@ -37,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+leaky-bucket.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
@@ -52,7 +52,6 @@ rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
-send-future.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
@@ -101,7 +100,6 @@ procfs.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
-indoc.workspace = true

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -103,13 +103,13 @@ async fn ingest(
        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
            let this_batch = std::mem::take(&mut batch);
-            let serialized = SerializedBatch::from_values(this_batch).unwrap();
+            let serialized = SerializedBatch::from_values(this_batch);
            layer.put_batch(serialized, &ctx).await?;
        }
    }
    if !batch.is_empty() {
        let this_batch = std::mem::take(&mut batch);
-        let serialized = SerializedBatch::from_values(this_batch).unwrap();
+        let serialized = SerializedBatch::from_values(this_batch);
        layer.put_batch(serialized, &ctx).await?;
    }
    layer.freeze(lsn + 1).await;
@@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
    page_cache::init(conf.page_cache_size);

    {
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -7,6 +7,7 @@ license.workspace = true
 [dependencies]
 pageserver_api.workspace = true
 thiserror.workspace = true
+async-trait.workspace = true
 reqwest = { workspace = true, features = [ "stream" ] }
 utils.workspace = true
 serde.workspace = true
--- a/pageserver/client/src/lib.rs
+++ b/pageserver/client/src/lib.rs
@@ -1,20 +1,2 @@
 pub mod mgmt_api;
 pub mod page_service;
-
-/// For timeline_block_unblock_gc, distinguish the two different operations. This could be a bool.
-// If file structure is per-kind not per-feature then where to put this?
-#[derive(Clone, Copy)]
-pub enum BlockUnblock {
-    Block,
-    Unblock,
-}
-
-impl std::fmt::Display for BlockUnblock {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let s = match self {
-            BlockUnblock::Block => "block",
-            BlockUnblock::Unblock => "unblock",
-        };
-        f.write_str(s)
-    }
-}
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -12,8 +12,6 @@ use utils::{

 pub use reqwest::Body as ReqwestBody;

-use crate::BlockUnblock;
-
 pub mod util;

 #[derive(Debug, Clone)]
@@ -421,24 +419,6 @@ impl Client {
        }
    }

-    pub async fn timeline_archival_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        req: &TimelineArchivalConfigRequest,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
-            self.mgmt_api_endpoint
-        );
-
-        self.request(Method::POST, &uri, req)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn timeline_detach_ancestor(
        &self,
        tenant_shard_id: TenantShardId,
@@ -456,20 +436,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn timeline_block_unblock_gc(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        dir: BlockUnblock,
-    ) -> Result<()> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/{dir}_gc",
-            self.mgmt_api_endpoint,
-        );
-
-        self.request(Method::POST, &uri, ()).await.map(|_| ())
-    }
-
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
@@ -540,16 +506,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    /// Configs io buffer alignment at runtime.
-    pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
-        let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
-        self.request(Method::PUT, uri, align)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -144,11 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
@@ -89,7 +89,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
    for (k, v) in all {
        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
-        assert!(k.is_i128_representable(), "invalid key: ");
    }
    // TODO(chi): special handling for last key?
    Ok(())
@@ -190,11 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            );
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -26,7 +26,7 @@ use pageserver::{
    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
    virtual_file,
 };
-use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
                println!("specified prefix '{}' failed validation", cmd.prefix);
                return Ok(());
            };
-            let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
+            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
            let toml_item = toml_document
                .get("remote_storage")
                .expect("need remote_storage");
@@ -205,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -58,11 +58,6 @@ pub(crate) struct Args {
    /// [`pageserver_api::models::virtual_file::IoEngineKind`].
    #[clap(long)]
    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
-
-    /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
-    #[clap(long)]
-    set_io_alignment: Option<usize>,
-
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -129,10 +124,6 @@ async fn main_impl(
        mgmt_api_client.put_io_engine(engine_str).await?;
    }

-    if let Some(align) = args.set_io_alignment {
-        mgmt_api_client.put_io_alignment(align).await?;
-    }
-
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
--- a/pageserver/src/assert_u64_eq_usize.rs
+++ b/pageserver/src/assert_u64_eq_usize.rs
@@ -1,39 +0,0 @@
-//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
-
-pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
-    if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
-        panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
-    }
-};
-
-pub(crate) trait U64IsUsize {
-    fn into_usize(self) -> usize;
-}
-
-impl U64IsUsize for u64 {
-    #[inline(always)]
-    fn into_usize(self) -> usize {
-        #[allow(clippy::let_unit_value)]
-        let _ = _ASSERT_U64_EQ_USIZE;
-        self as usize
-    }
-}
-
-pub(crate) trait UsizeIsU64 {
-    fn into_u64(self) -> u64;
-}
-
-impl UsizeIsU64 for usize {
-    #[inline(always)]
-    fn into_u64(self) -> u64 {
-        #[allow(clippy::let_unit_value)]
-        let _ = _ASSERT_U64_EQ_USIZE;
-        self as u64
-    }
-}
-
-pub const fn u64_to_usize(x: u64) -> usize {
-    #[allow(clippy::let_unit_value)]
-    let _ = _ASSERT_U64_EQ_USIZE;
-    x as usize
-}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -5,7 +5,6 @@
 use std::env;
 use std::env::{var, VarError};
 use std::io::Read;
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

@@ -37,7 +36,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::crashsafe::syncfs;
 use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::{
@@ -126,53 +124,19 @@ fn main() -> anyhow::Result<()> {
    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
+    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

-    // The tenants directory contains all the pageserver local disk state.
-    // Create if not exists and make sure all the contents are durable before proceeding.
-    // Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
-    // After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
-    // Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
    let tenants_path = conf.tenants_path();
-    {
-        let open = || {
-            nix::dir::Dir::open(
-                tenants_path.as_std_path(),
-                nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
-                nix::sys::stat::Mode::empty(),
-            )
-        };
-        let dirfd = match open() {
-            Ok(dirfd) => dirfd,
-            Err(e) => match e {
-                nix::errno::Errno::ENOENT => {
-                    utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
-                        format!("Failed to create tenants root dir at '{tenants_path}'")
-                    })?;
-                    open().context("open tenants dir after creating it")?
-                }
-                e => anyhow::bail!(e),
-            },
-        };
-
-        let started = Instant::now();
-        syncfs(dirfd)?;
-        let elapsed = started.elapsed();
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "made tenant directory contents durable"
-        );
+    if !tenants_path.exists() {
+        utils::crashsafe::create_dir_all(conf.tenants_path())
+            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
    }

    // Initialize up failpoints support
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        conf.max_file_descriptors,
-        conf.virtual_file_io_engine,
-        conf.io_buffer_alignment,
-    );
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
@@ -208,15 +172,27 @@ fn initialize_config(
        }
    };

-    let config_file_contents =
-        std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
-    let config_toml = serde_path_to_error::deserialize(
-        toml_edit::de::Deserializer::from_str(&config_file_contents)
-            .context("build toml deserializer")?,
-    )
-    .context("deserialize config toml")?;
-    let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
-        .context("runtime-validation of config toml")?;
+    let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
+        Ok(mut f) => {
+            let md = f.metadata().context("stat config file")?;
+            if md.is_file() {
+                let mut s = String::new();
+                f.read_to_string(&mut s).context("read config file")?;
+                s.parse().context("parse config file toml")?
+            } else {
+                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
+            }
+        }
+        Err(e) => {
+            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
+        }
+    };
+
+    debug!("Using pageserver toml: {config}");
+
+    // Construct the runtime representation
+    let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
+        .context("Failed to parse pageserver configuration")?;

    Ok(Box::leak(Box::new(conf)))
 }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,8 +1,6 @@
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
 use crate::config::PageServerConf;
-use crate::consumption_metrics::metrics::MetricsKey;
-use crate::consumption_metrics::upload::KeyGen as _;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::size::CalculateSyntheticSizeError;
@@ -10,7 +8,6 @@ use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
-use itertools::Itertools as _;
 use pageserver_api::models::TenantState;
 use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
 use reqwest::Url;
@@ -22,8 +19,9 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::NodeId;

-mod disk_cache;
 mod metrics;
+use crate::consumption_metrics::metrics::MetricsKey;
+mod disk_cache;
 mod upload;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
@@ -145,12 +143,6 @@ async fn collect_metrics(
        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;

-        // Pre-generate event idempotency keys, to reuse them across the bucket
-        // and HTTP sinks.
-        let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
-            .take(metrics.len())
-            .collect_vec();
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -169,14 +161,8 @@ async fn collect_metrics(
            }

            if let Some(bucket_client) = &bucket_client {
-                let res = upload::upload_metrics_bucket(
-                    bucket_client,
-                    &cancel,
-                    &node_id,
-                    &metrics,
-                    &idempotency_keys,
-                )
-                .await;
+                let res =
+                    upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
                if let Err(e) = res {
                    tracing::error!("failed to upload to S3: {e:#}");
                }
@@ -188,9 +174,9 @@ async fn collect_metrics(
                &client,
                metric_collection_endpoint,
                &cancel,
+                &node_id,
                &metrics,
                &mut cached_metrics,
-                &idempotency_keys,
            )
            .await;
            if let Err(e) = res {
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
    client: &reqwest::Client,
    metric_collection_endpoint: &reqwest::Url,
    cancel: &CancellationToken,
+    node_id: &str,
    metrics: &[RawMetric],
    cached_metrics: &mut Cache,
-    idempotency_keys: &[IdempotencyKey<'_>],
 ) -> anyhow::Result<()> {
    let mut uploaded = 0;
    let mut failed = 0;

    let started_at = std::time::Instant::now();

-    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
+    let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);

    while let Some(res) = iter.next() {
        let (chunk, body) = res?;
@@ -87,7 +87,6 @@ pub(super) async fn upload_metrics_bucket(
    cancel: &CancellationToken,
    node_id: &str,
    metrics: &[RawMetric],
-    idempotency_keys: &[IdempotencyKey<'_>],
 ) -> anyhow::Result<()> {
    if metrics.is_empty() {
        // Skip uploads if we have no metrics, so that readers don't have to handle the edge case
@@ -107,7 +106,7 @@ pub(super) async fn upload_metrics_bucket(

    // Serialize and write into compressed buffer
    let started_at = std::time::Instant::now();
-    for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
+    for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
        let (_chunk, body) = res?;
        gzip_writer.write_all(&body).await?;
    }
@@ -135,31 +134,29 @@ pub(super) async fn upload_metrics_bucket(
    Ok(())
 }

-/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
-/// idempotency keys are injected into the corresponding metric events (reused
-/// across different metrics sinks), and must have the same length as input.
-fn serialize_in_chunks<'a>(
+// The return type is quite ugly, but we gain testability in isolation
+fn serialize_in_chunks<'a, F>(
    chunk_size: usize,
    input: &'a [RawMetric],
-    idempotency_keys: &'a [IdempotencyKey<'a>],
+    factory: F,
 ) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
+where
+    F: KeyGen<'a> + 'a,
 {
    use bytes::BufMut;

-    assert_eq!(input.len(), idempotency_keys.len());
-
-    struct Iter<'a> {
+    struct Iter<'a, F> {
        inner: std::slice::Chunks<'a, RawMetric>,
-        idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
        chunk_size: usize,

        // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
        buffer: bytes::BytesMut,
        // chunk amount of events are reused to produce the serialized document
        scratch: Vec<Event<Ids, Name>>,
+        factory: F,
    }

-    impl<'a> Iterator for Iter<'a> {
+    impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
        type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;

        fn next(&mut self) -> Option<Self::Item> {
@@ -170,14 +167,17 @@ fn serialize_in_chunks<'a>(
                self.scratch.extend(
                    chunk
                        .iter()
-                        .zip(&mut self.idempotency_keys)
-                        .map(|(raw_metric, key)| raw_metric.as_event(key)),
+                        .map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
                );
            } else {
                // next rounds: update_in_place to reuse allocations
                assert_eq!(self.scratch.len(), self.chunk_size);
-                itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
-                    .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
+                self.scratch
+                    .iter_mut()
+                    .zip(chunk.iter())
+                    .for_each(|(slot, raw_metric)| {
+                        raw_metric.update_in_place(slot, &self.factory.generate())
+                    });
            }

            let res = serde_json::to_writer(
@@ -198,19 +198,18 @@ fn serialize_in_chunks<'a>(
        }
    }

-    impl<'a> ExactSizeIterator for Iter<'a> {}
+    impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}

    let buffer = bytes::BytesMut::new();
    let inner = input.chunks(chunk_size);
-    let idempotency_keys = idempotency_keys.iter();
    let scratch = Vec::new();

    Iter {
        inner,
-        idempotency_keys,
        chunk_size,
        buffer,
        scratch,
+        factory,
    }
 }

@@ -269,7 +268,7 @@ impl RawMetricExt for RawMetric {
    }
 }

-pub(crate) trait KeyGen<'a> {
+trait KeyGen<'a>: Copy {
    fn generate(&self) -> IdempotencyKey<'a>;
 }

@@ -390,10 +389,7 @@ mod tests {
        let examples = metric_samples();
        assert!(examples.len() > 1);

-        let now = Utc::now();
-        let idempotency_keys = (0..examples.len())
-            .map(|i| FixedGen::new(now, "1", i as u16).generate())
-            .collect::<Vec<_>>();
+        let factory = FixedGen::new(Utc::now(), "1", 42);

        // need to use Event here because serde_json::Value uses default hashmap, not linked
        // hashmap
@@ -402,13 +398,13 @@ mod tests {
            events: Vec<Event<Ids, Name>>,
        }

-        let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
+        let correct = serialize_in_chunks(examples.len(), &examples, factory)
            .map(|res| res.unwrap().1)
            .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
            .collect::<Vec<_>>();

        for chunk_size in 1..examples.len() {
-            let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
+            let actual = serialize_in_chunks(chunk_size, &examples, factory)
                .map(|res| res.unwrap().1)
                .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
                .collect::<Vec<_>>();
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -1,9 +1,7 @@
-//! Defines [`RequestContext`].
-//!
-//! It is a structure that we use throughout the pageserver to propagate
-//! high-level context from places that _originate_ activity down to the
-//! shared code paths at the heart of the pageserver. It's inspired by
-//! Golang's `context.Context`.
+//! This module defines `RequestContext`, a structure that we use throughout
+//! the pageserver to propagate high-level context from places
+//! that _originate_ activity down to the shared code paths at the
+//! heart of the pageserver. It's inspired by Golang's `context.Context`.
 //!
 //! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions:
 //! 1. What high-level activity ([`TaskKind`]) needs this page?
@@ -107,10 +105,8 @@ pub struct RequestContext {
 #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
 pub enum PageContentKind {
    Unknown,
-    DeltaLayerSummary,
    DeltaLayerBtreeNode,
    DeltaLayerValue,
-    ImageLayerSummary,
    ImageLayerBtreeNode,
    ImageLayerValue,
    InMemoryLayer,
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -141,32 +141,12 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                        m.other
                    );

-                    let az_id = {
-                        let az_id_from_metadata = m
-                            .other
-                            .get("availability_zone_id")
-                            .and_then(|jv| jv.as_str().map(|str| str.to_owned()));
-
-                        match az_id_from_metadata {
-                            Some(az_id) => Some(az_id),
-                            None => {
-                                tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
-                                conf.availability_zone.clone()
-                            }
-                        }
-                    };
-
-                    if az_id.is_none() {
-                        panic!("Availablity zone id could not be inferred from metadata.json or pageserver config");
-                    }
-
                    Some(NodeRegisterRequest {
                        node_id: conf.id,
                        listen_pg_addr: m.postgres_host,
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
-                        availability_zone_id: az_id.expect("Checked above"),
                    })
                }
                Err(e) => {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -41,15 +41,19 @@
 // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

-use std::{sync::Arc, time::SystemTime};
+use std::{
+    sync::Arc,
+    time::{Duration, SystemTime},
+};

 use anyhow::Context;
-use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::serde_percent::Percent;
 use utils::{completion, id::TimelineId};

 use crate::{
@@ -65,9 +69,23 @@ use crate::{
    CancellableTask, DiskUsageEvictionTask,
 };

+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DiskUsageEvictionTaskConfig {
+    pub max_usage_pct: Percent,
+    pub min_avail_bytes: u64,
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[cfg(feature = "testing")]
+    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
+    /// Select sorting for evicted layers
+    #[serde(default)]
+    pub eviction_order: EvictionOrder,
+}
+
 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "type", content = "args")]
 pub enum EvictionOrder {
    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
@@ -78,22 +96,23 @@ pub enum EvictionOrder {
        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
        /// to ensure nondeterminism by adding in a random number to break the
        /// `relative_last_activity==0.0` ties.
+        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
    },
 }

-impl From<pageserver_api::config::EvictionOrder> for EvictionOrder {
-    fn from(value: pageserver_api::config::EvictionOrder) -> Self {
-        match value {
-            pageserver_api::config::EvictionOrder::RelativeAccessed {
-                highest_layer_count_loses_first,
-            } => Self::RelativeAccessed {
-                highest_layer_count_loses_first,
-            },
+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
        }
    }
 }

+fn default_highest_layer_count_loses_first() -> bool {
+    true
+}
+
 impl EvictionOrder {
    fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
        use EvictionOrder::*;
@@ -276,7 +295,7 @@ async fn disk_usage_eviction_task_iteration(
        storage,
        usage_pre,
        tenant_manager,
-        task_config.eviction_order.into(),
+        task_config.eviction_order,
        cancel,
    )
    .await;
@@ -1238,6 +1257,7 @@ mod filesystem_level_usage {

    #[test]
    fn max_usage_pct_pressure() {
+        use super::EvictionOrder;
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;
@@ -1249,7 +1269,7 @@ mod filesystem_level_usage {
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: pageserver_api::config::EvictionOrder::default(),
+                eviction_order: EvictionOrder::default(),
            },
            total_bytes: 100_000,
            avail_bytes: 0,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -318,27 +318,6 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
    }
 }

-impl From<crate::tenant::TimelineArchivalError> for ApiError {
-    fn from(value: crate::tenant::TimelineArchivalError) -> Self {
-        use crate::tenant::TimelineArchivalError::*;
-        match value {
-            NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
-            Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
-            e @ HasArchivedParent(_) => {
-                ApiError::PreconditionFailed(e.to_string().into_boxed_str())
-            }
-            HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
-                format!(
-                    "Cannot archive timeline which has non-archived child timelines: {children:?}"
-                )
-                .into_boxed_str(),
-            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
-            Other(e) => ApiError::InternalServerError(e),
-        }
-    }
-}
-
 impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
        use crate::tenant::mgr::DeleteTimelineError::*;
@@ -426,8 +405,6 @@ async fn build_timeline_info_common(
    let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
-    // Report is_archived = false if the timeline is still loading
-    let is_archived = timeline.is_archived().unwrap_or(false);
    let remote_consistent_lsn_projected = timeline
        .get_remote_consistent_lsn_projected()
        .unwrap_or(Lsn(0));
@@ -468,7 +445,6 @@ async fn build_timeline_info_common(
        pg_version: timeline.pg_version,

        state,
-        is_archived: Some(is_archived),

        walreceiver_status,

@@ -710,7 +686,9 @@ async fn timeline_archival_config_handler(

        tenant
            .apply_timeline_archival_config(timeline_id, request_data.state)
-            .await?;
+            .await
+            .context("applying archival config")
+            .map_err(ApiError::InternalServerError)?;
        Ok::<_, ApiError>(())
    }
    .instrument(info_span!("timeline_archival_config",
@@ -874,10 +852,7 @@ async fn get_timestamp_of_lsn_handler(

    match result {
        Some(time) => {
-            let time = format_rfc3339(
-                postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
-            )
-            .to_string();
+            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
            json_response(StatusCode::OK, time)
        }
        None => Err(ApiError::NotFound(
@@ -1731,12 +1706,13 @@ async fn timeline_compact_handler(
        flags |= CompactFlags::ForceImageLayerCreation;
    }
    if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
+        if !cfg!(feature = "testing") {
+            return Err(ApiError::InternalServerError(anyhow!(
+                "enhanced_gc_bottom_most_compaction is only available in testing mode"
+            )));
+        }
        flags |= CompactFlags::EnhancedGcBottomMostCompaction;
    }
-    if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
-        flags |= CompactFlags::DryRun;
-    }
-
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -2076,7 +2052,7 @@ async fn disk_usage_eviction_run(
        evict_bytes: u64,

        #[serde(default)]
-        eviction_order: pageserver_api::config::EvictionOrder,
+        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
    }

    #[derive(Debug, Clone, Copy, serde::Serialize)]
@@ -2112,7 +2088,7 @@ async fn disk_usage_eviction_run(
        &state.remote_storage,
        usage,
        &state.tenant_manager,
-        config.eviction_order.into(),
+        config.eviction_order,
        &cancel,
    )
    .await;
@@ -2354,20 +2330,6 @@ async fn put_io_engine_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_io_alignment_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let align: usize = json_request(&mut r).await?;
-    crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
-        ApiError::PreconditionFailed(
-            format!("Requested io alignment ({align}) is not a power of two").into(),
-        )
-    })?;
-    json_response(StatusCode::OK, ())
-}
-
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -2980,7 +2942,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
-            |r| api_handler(r, timeline_compact_handler),
+            |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
@@ -3055,9 +3017,6 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
-        .put("/v1/io_alignment", |r| {
-            api_handler(r, put_io_alignment_handler)
-        })
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -19,7 +19,6 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
-use crate::walrecord::decode_wal_record;
 use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -311,13 +310,11 @@ async fn import_wal(

        let mut nrecords = 0;
        let mut modification = tline.begin_modification(last_lsn);
+        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                let mut decoded = DecodedWALRecord::default();
-                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
-
                walingest
-                    .ingest_record(decoded, lsn, &mut modification, ctx)
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
                WAL_INGEST.records_committed.inc();

@@ -452,12 +449,11 @@ pub async fn import_wal_from_tar(
        waldecoder.feed_bytes(&bytes[offset..]);

        let mut modification = tline.begin_modification(last_lsn);
+        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                let mut decoded = DecodedWALRecord::default();
-                decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
                walingest
-                    .ingest_record(decoded, lsn, &mut modification, ctx)
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
                modification.commit(ctx).await?;
                last_lsn = lsn;
--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,7 +1,9 @@
 use std::{num::NonZeroUsize, sync::Arc};

-#[derive(Debug, PartialEq, Eq, Clone)]
+#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
+#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
+    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
 }

@@ -14,16 +16,6 @@ impl Default for L0FlushConfig {
    }
 }

-impl From<pageserver_api::models::L0FlushConfig> for L0FlushConfig {
-    fn from(config: pageserver_api::models::L0FlushConfig) -> Self {
-        match config {
-            pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => {
-                Self::Direct { max_concurrency }
-            }
-        }
-    }
-}
-
 #[derive(Clone)]
 pub struct L0FlushGlobalState(Arc<Inner>);

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -16,7 +16,6 @@ pub mod l0_flush;
 use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
-mod assert_u64_eq_usize;
 pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, VariantNames};
-use strum_macros::{IntoStaticStr, VariantNames};
+use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;

@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];

 // Metrics collected on operations on the storage repository.
-#[derive(Debug, VariantNames, IntoStaticStr)]
+#[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
@@ -1552,6 +1552,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
 #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
 pub(crate) enum ComputeCommandKind {
    PageStreamV2,
+    PageStream,
    Basebackup,
    Fullbackup,
    LeaseLsn,
@@ -1802,14 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_utilization_score",
-        "The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_secondary_heatmap_total_size",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -557,7 +557,7 @@ impl PageServerHandler {
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        _protocol_version: PagestreamProtocolVersion,
+        protocol_version: PagestreamProtocolVersion,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -601,7 +601,8 @@ impl PageServerHandler {
            fail::fail_point!("ps::handle-pagerequest-message");

            // parse request
-            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+            let neon_fe_msg =
+                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;

            // invoke handler function
            let (handler_result, span) = match neon_fe_msg {
@@ -753,21 +754,16 @@ impl PageServerHandler {
        }

        if request_lsn < **latest_gc_cutoff_lsn {
-            let gc_info = &timeline.gc_info.read().unwrap();
-            if !gc_info.leases.contains_key(&request_lsn) {
-                // The requested LSN is below gc cutoff and is not guarded by a lease.
-
-                // Check explicitly for INVALID just to get a less scary error message if the
-                // request is obviously bogus
-                return Err(if request_lsn == Lsn::INVALID {
-                    PageStreamError::BadRequest("invalid LSN(0) in request".into())
-                } else {
-                    PageStreamError::BadRequest(format!(
+            // Check explicitly for INVALID just to get a less scary error message if the
+            // request is obviously bogus
+            return Err(if request_lsn == Lsn::INVALID {
+                PageStreamError::BadRequest("invalid LSN(0) in request".into())
+            } else {
+                PageStreamError::BadRequest(format!(
                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
                        request_lsn, **latest_gc_cutoff_lsn
                    ).into())
-                });
-            }
+            });
        }

        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
@@ -794,8 +790,6 @@ impl PageServerHandler {
        }
    }

-    /// Handles the lsn lease request.
-    /// If a lease cannot be obtained, the client will receive NULL.
    #[instrument(skip_all, fields(shard_id, %lsn))]
    async fn handle_make_lsn_lease<IO>(
        &mut self,
@@ -818,25 +812,19 @@ impl PageServerHandler {
            .await?;
        set_tracing_field_shard_id(&timeline);

-        let lease = timeline
-            .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
-            .inspect_err(|e| {
-                warn!("{e}");
-            })
-            .ok();
-        let valid_until_str = lease.map(|l| {
-            l.valid_until
-                .duration_since(SystemTime::UNIX_EPOCH)
-                .expect("valid_until is earlier than UNIX_EPOCH")
-                .as_millis()
-                .to_string()
-        });
-        let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
+        let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
+        let valid_until = lease
+            .valid_until
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map_err(|e| QueryError::Other(e.into()))?;

        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
            b"valid_until",
        )]))?
-        .write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
+        .write_message_noflush(&BeMessage::DataRow(&[Some(
+            &valid_until.as_millis().to_be_bytes(),
+        )]))?
+        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;

        Ok(())
    }
@@ -1199,6 +1187,7 @@ impl PageServerHandler {
    }
 }

+#[async_trait::async_trait]
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1286,6 +1275,35 @@ where
                ctx,
            )
            .await?;
+        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
+            if params.len() != 2 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number for pagestream command"
+                )));
+            }
+            let tenant_id = TenantId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_id))?;
+
+            COMPUTE_COMMANDS_COUNTERS
+                .for_command(ComputeCommandKind::PageStream)
+                .inc();
+
+            self.handle_pagerequests(
+                pgb,
+                tenant_id,
+                timeline_id,
+                PagestreamProtocolVersion::V1,
+                ctx,
+            )
+            .await?;
        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
            if params.len() < 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -168,9 +168,7 @@ impl Timeline {
        DatadirModification {
            tline: self,
            pending_lsns: Vec::new(),
-            pending_metadata_pages: HashMap::new(),
-            pending_data_pages: Vec::new(),
-            pending_zero_data_pages: Default::default(),
+            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
@@ -729,12 +727,8 @@ impl Timeline {
        let current_policy = self.last_aux_file_policy.load();
        match current_policy {
            Some(AuxFilePolicy::V1) => {
-                let res = self.list_aux_files_v1(lsn, ctx).await?;
-                let empty_str = if res.is_empty() { ", empty" } else { "" };
-                warn!(
-                    "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})"
-                );
-                Ok(res)
+                warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
+                self.list_aux_files_v1(lsn, ctx).await
            }
            None => {
                let res = self.list_aux_files_v1(lsn, ctx).await?;
@@ -1021,10 +1015,9 @@ impl Timeline {
 }

 /// DatadirModification represents an operation to ingest an atomic set of
-/// updates to the repository.
-///
-/// It is created by the 'begin_record' function. It is called for each WAL
-/// record, so that all the modifications by a one WAL record appear atomic.
+/// updates to the repository. It is created by the 'begin_record'
+/// function. It is called for each WAL record, so that all the modifications
+/// by a single WAL record appear atomic.
 pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
@@ -1038,24 +1031,10 @@ pub struct DatadirModification<'a> {
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec<Lsn>,
+    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

-    /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications
-    /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'.
-    pending_metadata_pages: HashMap<CompactKey, Vec<(Lsn, usize, Value)>>,
-
-    /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
-    /// which keys are stored here.
-    pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
-
-    // Sometimes during ingest, for example when extending a relation, we would like to write a zero page.  However,
-    // if we encounter a write from postgres in the same wal record, we will drop this entry.
-    //
-    // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
-    // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
-    pending_zero_data_pages: HashSet<CompactKey>,
-
    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
@@ -1079,10 +1058,6 @@ impl<'a> DatadirModification<'a> {
        self.pending_bytes
    }

-    pub(crate) fn has_dirty_data_pages(&self) -> bool {
-        (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
-    }
-
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1091,10 +1066,6 @@ impl<'a> DatadirModification<'a> {
            lsn,
            self.lsn
        );
-
-        // If we are advancing LSN, then state from previous wal record should have been flushed.
-        assert!(self.pending_zero_data_pages.is_empty());
-
        if lsn > self.lsn {
            self.pending_lsns.push(self.lsn);
            self.lsn = lsn;
@@ -1102,17 +1073,6 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means
-    /// keys that represent literal blocks that postgres can read.  So data includes relation blocks and
-    /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata.
-    ///
-    /// The distinction is important because data keys are handled on a fast path where dirty writes are
-    /// not readable until this modification is committed, whereas metadata keys are visible for read
-    /// via [`Self::get`] as soon as their record has been ingested.
-    fn is_data_key(key: &Key) -> bool {
-        key.is_rel_block_key() || key.is_slru_block_key()
-    }
-
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -1205,13 +1165,6 @@ impl<'a> DatadirModification<'a> {
        img: Bytes,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let key = rel_block_to_key(rel, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
-        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -1223,63 +1176,10 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        let key = slru_block_to_key(kind, segno, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
-        }
-        self.put(key, Value::Image(img));
+        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
        Ok(())
    }

-    pub(crate) fn put_rel_page_image_zero(
-        &mut self,
-        rel: RelTag,
-        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let key = rel_block_to_key(rel, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
-        }
-        self.pending_zero_data_pages.insert(key.to_compact());
-        self.pending_bytes += ZERO_PAGE.len();
-        Ok(())
-    }
-
-    pub(crate) fn put_slru_page_image_zero(
-        &mut self,
-        kind: SlruKind,
-        segno: u32,
-        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        let key = slru_block_to_key(kind, segno, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
-        }
-        self.pending_zero_data_pages.insert(key.to_compact());
-        self.pending_bytes += ZERO_PAGE.len();
-        Ok(())
-    }
-
-    /// Call this at the end of each WAL record.
-    pub(crate) fn on_record_end(&mut self) {
-        let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
-        for key in pending_zero_data_pages {
-            self.put_data(key, Value::Image(ZERO_PAGE.clone()));
-        }
-    }
-
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1697,7 +1597,7 @@ impl<'a> DatadirModification<'a> {
                if aux_files_key_v1.is_empty() {
                    None
                } else {
-                    warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)");
+                    warn!("this timeline is using deprecated aux file policy V1");
                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
                    Some(AuxFilePolicy::V1)
                }
@@ -1878,7 +1778,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1889,11 +1789,26 @@ impl<'a> DatadirModification<'a> {
        let mut writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
+        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
+        for (key, values) in self.pending_updates.drain() {
+            let mut write_batch = Vec::new();
+            for (lsn, value_ser_size, value) in values {
+                if key.is_rel_block_key() || key.is_slru_block_key() {
+                    // This bails out on first error without modifying pending_updates.
+                    // That's Ok, cf this function's doc comment.
+                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
+                } else {
+                    retained_pending_updates.entry(key).or_default().push((
+                        lsn,
+                        value_ser_size,
+                        value,
+                    ));
+                }
+            }
+            writer.put_batch(write_batch, ctx).await?;
+        }

-        // This bails out on first error without modifying pending_updates.
-        // That's Ok, cf this function's doc comment.
-        writer.put_batch(pending_data_pages, ctx).await?;
+        self.pending_updates = retained_pending_updates;
        self.pending_bytes = 0;

        if pending_nblocks != 0 {
@@ -1914,31 +1829,26 @@ impl<'a> DatadirModification<'a> {
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
-        // Commit should never be called mid-wal-record
-        assert!(self.pending_zero_data_pages.is_empty());
-
        let mut writer = self.tline.writer().await;

        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        // Ordering: the items in this batch do not need to be in any global order, but values for
-        // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
-        // this to do efficient updates to its index.
-        let mut write_batch = std::mem::take(&mut self.pending_data_pages);
-
-        write_batch.extend(
-            self.pending_metadata_pages
+        if !self.pending_updates.is_empty() {
+            // Ordering: the items in this batch do not need to be in any global order, but values for
+            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+            // this to do efficient updates to its index.
+            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
+                .pending_updates
                .drain()
                .flat_map(|(key, values)| {
-                    values
-                        .into_iter()
-                        .map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
-                }),
-        );
+                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
+                        (key.to_compact(), lsn, val_ser_size, value)
+                    })
+                })
+                .collect::<Vec<_>>();

-        if !write_batch.is_empty() {
-            writer.put_batch(write_batch, ctx).await?;
+            writer.put_batch(batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1969,58 +1879,33 @@ impl<'a> DatadirModification<'a> {
    }

    pub(crate) fn len(&self) -> usize {
-        self.pending_metadata_pages.len()
-            + self.pending_data_pages.len()
-            + self.pending_deletions.len()
+        self.pending_updates.len() + self.pending_deletions.len()
    }

-    /// Read a page from the Timeline we are writing to.  For metadata pages, this passes through
-    /// a cache in Self, which makes writes earlier in this modification visible to WAL records later
-    /// in the modification.
-    ///
-    /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data
-    /// page must ensure that the pages they read are already committed in Timeline, for example
-    /// DB create operations are always preceded by a call to commit().  This is special cased because
-    /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes,
-    /// and not data pages.
-    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        if !Self::is_data_key(&key) {
-            // Have we already updated the same key? Read the latest pending updated
-            // version in that case.
-            //
-            // Note: we don't check pending_deletions. It is an error to request a
-            // value that has been removed, deletion only avoids leaking storage.
-            if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) {
-                if let Some((_, _, value)) = values.last() {
-                    return if let Value::Image(img) = value {
-                        Ok(img.clone())
-                    } else {
-                        // Currently, we never need to read back a WAL record that we
-                        // inserted in the same "transaction". All the metadata updates
-                        // work directly with Images, and we never need to read actual
-                        // data pages. We could handle this if we had to, by calling
-                        // the walredo manager, but let's keep it simple for now.
-                        Err(PageReconstructError::Other(anyhow::anyhow!(
-                            "unexpected pending WAL record"
-                        )))
-                    };
-                }
-            }
-        } else {
-            // This is an expensive check, so we only do it in debug mode. If reading a data key,
-            // this key should never be present in pending_data_pages. We ensure this by committing
-            // modifications before ingesting DB create operations, which are the only kind that reads
-            // data pages during ingest.
-            if cfg!(debug_assertions) {
-                for (dirty_key, _, _, _) in &self.pending_data_pages {
-                    debug_assert!(&key.to_compact() != dirty_key);
-                }
+    // Internal helper functions to batch the modifications

-                debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
+    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
+        // Have we already updated the same key? Read the latest pending updated
+        // version in that case.
+        //
+        // Note: we don't check pending_deletions. It is an error to request a
+        // value that has been removed, deletion only avoids leaking storage.
+        if let Some(values) = self.pending_updates.get(&key) {
+            if let Some((_, _, value)) = values.last() {
+                return if let Value::Image(img) = value {
+                    Ok(img.clone())
+                } else {
+                    // Currently, we never need to read back a WAL record that we
+                    // inserted in the same "transaction". All the metadata updates
+                    // work directly with Images, and we never need to read actual
+                    // data pages. We could handle this if we had to, by calling
+                    // the walredo manager, but let's keep it simple for now.
+                    Err(PageReconstructError::Other(anyhow::anyhow!(
+                        "unexpected pending WAL record"
+                    )))
+                };
            }
        }
-
-        // Metadata page cache miss, or we're reading a data page.
        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
        self.tline.get(key, lsn, ctx).await
    }
@@ -2032,40 +1917,11 @@ impl<'a> DatadirModification<'a> {
    }

    fn put(&mut self, key: Key, val: Value) {
-        if Self::is_data_key(&key) {
-            self.put_data(key.to_compact(), val)
-        } else {
-            self.put_metadata(key.to_compact(), val)
-        }
-    }
-
-    fn put_data(&mut self, key: CompactKey, val: Value) {
-        let val_serialized_size = val.serialized_size().unwrap() as usize;
-
-        // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write.  This
-        // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
-        // and the subsequent postgres-originating write
-        if self.pending_zero_data_pages.remove(&key) {
-            self.pending_bytes -= ZERO_PAGE.len();
-        }
-
-        self.pending_bytes += val_serialized_size;
-        self.pending_data_pages
-            .push((key, self.lsn, val_serialized_size, val))
-    }
-
-    fn put_metadata(&mut self, key: CompactKey, val: Value) {
-        let values = self.pending_metadata_pages.entry(key).or_default();
+        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
-                // Update the pending_bytes contribution from this entry, and update the serialized size in place
-                self.pending_bytes -= *last_value_ser_size;
                *last_value_ser_size = val.serialized_size().unwrap() as usize;
-                self.pending_bytes += *last_value_ser_size;
-
-                // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much
-                // have been generated by synthesized zero page writes prior to the first real write to a page.
                *last_value = val;
                return;
            }
@@ -2084,7 +1940,6 @@ impl<'a> DatadirModification<'a> {

 /// This struct facilitates accessing either a committed key from the timeline at a
 /// specific LSN, or the latest uncommitted key from a pending modification.
-///
 /// During WAL ingestion, the records from multiple LSNs may be batched in the same
 /// modification before being flushed to the timeline. Hence, the routines in WalIngest
 /// need to look up the keys in the modification first before looking them up in the
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -60,7 +60,32 @@ pub mod mock {
    use regex::Regex;
    use tracing::log::info;

-    pub use pageserver_api::config::statvfs::mock::Behavior;
+    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[serde(tag = "type")]
+    pub enum Behavior {
+        Success {
+            blocksize: u64,
+            total_blocks: u64,
+            name_filter: Option<utils::serde_regex::Regex>,
+        },
+        Failure {
+            mocked_error: MockedError,
+        },
+    }
+
+    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[allow(clippy::upper_case_acronyms)]
+    pub enum MockedError {
+        EIO,
+    }
+
+    impl From<MockedError> for nix::Error {
+        fn from(e: MockedError) -> Self {
+            match e {
+                MockedError::EIO => nix::Error::EIO,
+            }
+        }
+    }

    pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
        info!("running mocked statvfs");
@@ -91,7 +116,6 @@ pub mod mock {
                    block_size: *blocksize,
                })
            }
-            #[cfg(feature = "testing")]
            Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
        }
    }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -146,12 +146,6 @@ impl FromStr for TokioRuntimeMode {
    }
 }

-static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
-    env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
-        // the default 2MiB are insufficent, especially in debug mode
-        .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
-});
-
 static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
    let thread_name = "pageserver-tokio";
    let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
@@ -170,7 +164,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
            tokio::runtime::Builder::new_current_thread()
                .thread_name(thread_name)
                .enable_all()
-                .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                .build()
                .expect("failed to create one single runtime")
        }
@@ -180,7 +173,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
                .thread_name(thread_name)
                .enable_all()
                .worker_threads(num_workers.get())
-                .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                .build()
                .expect("failed to create one multi-threaded runtime")
        }
@@ -207,7 +199,6 @@ macro_rules! pageserver_runtime {
                    .thread_name($name)
                    .worker_threads(TOKIO_WORKER_THREADS.get())
                    .enable_all()
-                    .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
                    .build()
                    .expect(std::concat!("Failed to create runtime ", $name))
            });
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1,9 +1,8 @@
-//! Timeline repository implementation that keeps old data in layer files, and
-//! the recent changes in ephemeral files.
 //!
-//! See tenant/*_layer.rs files. The functions here are responsible for locating
-//! the correct layer for the get/put call, walking back the timeline branching
-//! history as needed.
+//! Timeline repository implementation that keeps old data in files on disk, and
+//! the recent changes in memory. See tenant/*_layer.rs files.
+//! The functions here are responsible for locating the correct layer for the
+//! get/put call, walking back the timeline branching history as needed.
 //!
 //! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
 //! directory. See docs/pageserver-storage.md for how the files are managed.
@@ -502,42 +501,6 @@ impl Debug for DeleteTimelineError {
    }
 }

-#[derive(thiserror::Error)]
-pub enum TimelineArchivalError {
-    #[error("NotFound")]
-    NotFound,
-
-    #[error("Timeout")]
-    Timeout,
-
-    #[error("ancestor is archived: {}", .0)]
-    HasArchivedParent(TimelineId),
-
-    #[error("HasUnarchivedChildren")]
-    HasUnarchivedChildren(Vec<TimelineId>),
-
-    #[error("Timeline archival is already in progress")]
-    AlreadyInProgress,
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl Debug for TimelineArchivalError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::NotFound => write!(f, "NotFound"),
-            Self::Timeout => write!(f, "Timeout"),
-            Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
-            Self::HasUnarchivedChildren(c) => {
-                f.debug_tuple("HasUnarchivedChildren").field(c).finish()
-            }
-            Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
-            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
-        }
-    }
-}
-
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
@@ -882,12 +845,6 @@ impl Tenant {
                        });
                    };

-                // TODO: should also be rejecting tenant conf changes that violate this check.
-                if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
-                    make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
-                    return Ok(());
-                }
-
                let mut init_order = init_order;
                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
@@ -1369,59 +1326,24 @@ impl Tenant {
        &self,
        timeline_id: TimelineId,
        state: TimelineArchivalState,
-    ) -> Result<(), TimelineArchivalError> {
-        info!("setting timeline archival config");
-        let timeline = {
-            let timelines = self.timelines.lock().unwrap();
-
-            let Some(timeline) = timelines.get(&timeline_id) else {
-                return Err(TimelineArchivalError::NotFound);
-            };
-
-            if state == TimelineArchivalState::Unarchived {
-                if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
-                    if ancestor_timeline.is_archived() == Some(true) {
-                        return Err(TimelineArchivalError::HasArchivedParent(
-                            ancestor_timeline.timeline_id,
-                        ));
-                    }
-                }
-            }
-
-            // Ensure that there are no non-archived child timelines
-            let children: Vec<TimelineId> = timelines
-                .iter()
-                .filter_map(|(id, entry)| {
-                    if entry.get_ancestor_timeline_id() != Some(timeline_id) {
-                        return None;
-                    }
-                    if entry.is_archived() == Some(true) {
-                        return None;
-                    }
-                    Some(*id)
-                })
-                .collect();
-
-            if !children.is_empty() && state == TimelineArchivalState::Archived {
-                return Err(TimelineArchivalError::HasUnarchivedChildren(children));
-            }
-            Arc::clone(timeline)
-        };
+    ) -> anyhow::Result<()> {
+        let timeline = self
+            .get_timeline(timeline_id, false)
+            .context("Cannot apply timeline archival config to inexistent timeline")?;

        let upload_needed = timeline
            .remote_client
            .schedule_index_upload_for_timeline_archival_state(state)?;

        if upload_needed {
-            info!("Uploading new state");
            const MAX_WAIT: Duration = Duration::from_secs(10);
            let Ok(v) =
                tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
            else {
                tracing::warn!("reached timeout for waiting on upload queue");
-                return Err(TimelineArchivalError::Timeout);
+                bail!("reached timeout for upload queue flush");
            };
-            v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
+            v?;
        }
        Ok(())
    }
@@ -3819,21 +3741,13 @@ impl Tenant {
    /// less than this (via eviction and on-demand downloads), but this function enables
    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
-    ///
-    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
-    /// than they report here, due to layer eviction.  Tenants with many active branches may
-    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
+        let mut wanted = 0;
        let timelines = self.timelines.lock().unwrap();
-
-        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
-        // reflects the observation that on tenants with multiple large branches, typically only one
-        // of them is used actively enough to occupy space on disk.
-        timelines
-            .values()
-            .map(|t| t.metrics.visible_physical_size_gauge.get())
-            .max()
-            .unwrap_or(0)
+        for timeline in timelines.values() {
+            wanted += timeline.metrics.visible_physical_size_gauge.get();
+        }
+        wanted
    }
 }

@@ -7091,14 +7005,18 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: Key::MIN..Key::MAX,
+                    key_range: {
+                        let mut key = Key::MAX;
+                        key.field6 -= 1;
+                        Key::MIN..key
+                    },
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
-                // The delta layer below the horizon
+                // The delta layer that is cut in the middle
                PersistentLayerKey {
                    key_range: get_key(3)..get_key(4),
-                    lsn_range: Lsn(0x30)..Lsn(0x48),
+                    lsn_range: Lsn(0x30)..Lsn(0x41),
                    is_delta: true
                },
                // The delta3 layer that should not be picked for the compaction
@@ -8078,214 +7996,6 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
-    {
-        let harness =
-            TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
-                .await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
-            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(1),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(1),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(1),
-                Lsn(0x38),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![
-                    // delta1 and delta 2 only contain a single key but multiple updates
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![(Lsn(0x10), img_layer)], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-        {
-            // Update GC info
-            let mut guard = tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![
-                    (Lsn(0x10), tline.timeline_id),
-                    (Lsn(0x20), tline.timeline_id),
-                ],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
-                    space: Lsn(0x30),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10"),
-            Bytes::from_static(b"value 6@0x10"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10"),
-            Bytes::from_static(b"value 6@0x10"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let expected_result_at_lsn_20 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10"),
-            Bytes::from_static(b"value 6@0x10"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let expected_result_at_lsn_10 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10"),
-            Bytes::from_static(b"value 2@0x10"),
-            Bytes::from_static(b"value 3@0x10"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10"),
-            Bytes::from_static(b"value 6@0x10"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            let gc_horizon = {
-                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
-            };
-            for idx in 0..10 {
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), gc_horizon, &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x20), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_20[idx]
-                );
-                assert_eq!(
-                    tline
-                        .get(get_key(idx as u32), Lsn(0x10), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_10[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        let mut dryrun_flags = EnumSet::new();
-        dryrun_flags.insert(CompactFlags::DryRun);
-
-        tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
-            .await
-            .unwrap();
-        // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
-        // cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
-        verify_result().await;
-
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        // compact again
-        tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
-            .await
-            .unwrap();
-        verify_result().await;
-
-        Ok(())
-    }
-
    #[tokio::test]
    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;

 /// The maximum size of blobs we support. The highest few bits
 /// are reserved for compression and other further uses.
-pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
+const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;

 pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
 pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
            } else {
                // Write a 4-byte length header
-                if len > MAX_SUPPORTED_BLOB_LEN {
+                if len > MAX_SUPPORTED_LEN {
                    return (
                        (
                            io_buf.slice_len(),
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,6 +2,7 @@
 //! Low-level Block-oriented I/O functions
 //!

+use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -80,7 +81,9 @@ impl<'a> Deref for BlockLease<'a> {
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
    FileBlockReader(&'a FileBlockReader<'a>),
+    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
+    Slice(&'a [u8]),
    #[cfg(test)]
    TestDisk(&'a super::disk_btree::tests::TestDisk),
    #[cfg(test)]
@@ -97,7 +100,9 @@ impl<'a> BlockReaderRef<'a> {
        use BlockReaderRef::*;
        match self {
            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
+            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
            Adapter(r) => r.read_blk(blknum, ctx).await,
+            Slice(s) => Self::read_blk_slice(s, blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
@@ -106,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
    }
 }

+impl<'a> BlockReaderRef<'a> {
+    fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
+        let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
+        let end = start.checked_add(PAGE_SZ).unwrap();
+        if end > slice.len() {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::UnexpectedEof,
+                format!("slice too short, len={} end={}", slice.len(), end),
+            ));
+        }
+        let slice = &slice[start..end];
+        let page_sized: &[u8; PAGE_SZ] = slice
+            .try_into()
+            .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
+        Ok(BlockLease::Slice(page_sized))
+    }
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,10 +9,11 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
-pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
+use pageserver_api::models::LsnLease;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
@@ -22,6 +23,50 @@ use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;

+pub mod defaults {
+
+    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
+    // would be more appropriate. But a low value forces the code to be exercised more,
+    // which is good for now to trigger bugs.
+    // This parameter actually determines L0 layer file size.
+    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
+    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
+
+    // FIXME the below configs are only used by legacy algorithm. The new algorithm
+    // has different parameters.
+
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
+
+    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
+    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
+        super::CompactionAlgorithm::Legacy;
+
+    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
+
+    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
+    // If there's a need to decrease this value, first make sure that GC
+    // doesn't hold a layer map write lock for non-trivial operations.
+    // Relevant: https://github.com/neondatabase/neon/issues/3394
+    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
+    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    // The default limit on WAL lag should be set to avoid causing disconnects under high throughput
+    // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
+    // throughputs up to 1GiB/s per timeline.
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default, ingest enough WAL for two new L0 layers before checking if new
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
+}
+
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub(crate) enum AttachmentMode {
    /// Our generation is current as far as we know, and as far as we know we are the only attached
@@ -236,20 +281,96 @@ impl LocationConf {
    }
 }

-impl Default for LocationConf {
-    // TODO: this should be removed once tenant loading can guarantee that we are never
-    // loading from a directory without a configuration.
-    // => tech debt since https://github.com/neondatabase/neon/issues/1555
-    fn default() -> Self {
-        Self {
-            mode: LocationMode::Attached(AttachedLocationConfig {
-                generation: Generation::none(),
-                attach_mode: AttachmentMode::Single,
-            }),
-            tenant_conf: TenantConfOpt::default(),
-            shard: ShardIdentity::unsharded(),
-        }
-    }
+/// A tenant's calculated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct TenantConf {
+    // Flush out an inmemory layer, if it's holding WAL older than this
+    // This puts a backstop on how much WAL needs to be re-digested if the
+    // page server crashes.
+    // This parameter actually determines L0 layer file size.
+    pub checkpoint_distance: u64,
+    // Inmemory layer is also flushed at least once in checkpoint_timeout to
+    // eventually upload WAL after activity is stopped.
+    #[serde(with = "humantime_serde")]
+    pub checkpoint_timeout: Duration,
+    // Target file size, when creating image and delta layers.
+    // This parameter determines L1 layer file size.
+    pub compaction_target_size: u64,
+    // How often to check if there's compaction work to be done.
+    // Duration::ZERO means automatic compaction is disabled.
+    #[serde(with = "humantime_serde")]
+    pub compaction_period: Duration,
+    // Level0 delta layer threshold for compaction.
+    pub compaction_threshold: usize,
+    pub compaction_algorithm: CompactionAlgorithmSettings,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is #of bytes of WAL.
+    // Page versions older than this are garbage collected away.
+    pub gc_horizon: u64,
+    // Interval at which garbage collection is triggered.
+    // Duration::ZERO means automatic GC is disabled
+    #[serde(with = "humantime_serde")]
+    pub gc_period: Duration,
+    // Delta layer churn threshold to create L1 image layers.
+    pub image_creation_threshold: usize,
+    // Determines how much history is retained, to allow
+    // branching and read replicas at an older point in time.
+    // The unit is time.
+    // Page versions older than this are garbage collected away.
+    #[serde(with = "humantime_serde")]
+    pub pitr_interval: Duration,
+    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Duration,
+    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
+    /// A stalled safekeeper will be changed to a newer one when it appears.
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Duration,
+    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
+    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
+    /// to avoid eager reconnects.
+    pub max_lsn_wal_lag: NonZeroU64,
+    pub eviction_policy: EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
+    // See the corresponding metric's help string.
+    #[serde(with = "humantime_serde")]
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
+    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
+    /// may be disabled if a Tenant will not have secondary locations: only secondary
+    /// locations will use the heatmap uploaded by attached locations.
+    #[serde(with = "humantime_serde")]
+    pub heatmap_period: Duration,
+
+    /// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
+    pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expressed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
+
+    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
+    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
+    pub switch_aux_file_policy: AuxFilePolicy,
+
+    /// The length for an explicit LSN lease request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length: Duration,
+
+    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length_for_ts: Duration,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -424,6 +545,51 @@ impl TenantConfOpt {
    }
 }

+impl Default for TenantConf {
+    fn default() -> Self {
+        use defaults::*;
+        Self {
+            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
+                .expect("cannot parse default checkpoint timeout"),
+            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
+            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
+                .expect("cannot parse default compaction period"),
+            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
+            compaction_algorithm: CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
+            gc_horizon: DEFAULT_GC_HORIZON,
+            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
+                .expect("cannot parse default gc period"),
+            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
+                .expect("cannot parse default PITR interval"),
+            walreceiver_connect_timeout: humantime::parse_duration(
+                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
+            )
+            .expect("cannot parse default walreceiver connect timeout"),
+            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
+                .expect("cannot parse default walreceiver lagging wal timeout"),
+            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
+                .expect("cannot parse default max walreceiver Lsn wal lag"),
+            eviction_policy: EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            heatmap_period: Duration::ZERO,
+            lazy_slru_download: false,
+            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
+            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
+            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
+        }
+    }
+}
+
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    type Error = anyhow::Error;

@@ -452,8 +618,7 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
            toml_edit::Item::Table(table) => {
-                let deserializer =
-                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
+                let deserializer = toml_edit::de::Deserializer::new(table.into());
                return serde_path_to_error::deserialize(deserializer)
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -1,21 +1,13 @@
 //! Implementation of append-only file data structure
 //! used to keep in-memory layers spilled on disk.

-use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
-use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
-use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
-use crate::virtual_file::owned_buffers_io::write::Buffer;
-use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
-use bytes::BytesMut;
+use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
+use crate::virtual_file::{self, VirtualFile};
 use camino::Utf8PathBuf;
-use num_traits::Num;
 use pageserver_api::shard::TenantShardId;
-use tokio_epoll_uring::{BoundedBuf, Slice};
-use tracing::error;

 use std::io;
 use std::sync::atomic::AtomicU64;
@@ -24,17 +16,12 @@ use utils::id::TimelineId;
 pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
-    page_cache_file_id: page_cache::FileId,
-    bytes_written: u64,
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        BytesMut,
-        size_tracking_writer::Writer<VirtualFile>,
-    >,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
-    _gate_guard: utils::sync::gate::GateGuard,
+
+    rw: page_caching::RW,
 }

-const TAIL_SZ: usize = 64 * 1024;
+mod page_caching;
+mod zero_padded_read_write;

 impl EphemeralFile {
    pub async fn create(
@@ -64,178 +51,75 @@ impl EphemeralFile {
        )
        .await?;

-        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
-
        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            page_cache_file_id,
-            bytes_written: 0,
-            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
-                size_tracking_writer::Writer::new(file),
-                BytesMut::with_capacity(TAIL_SZ),
-            ),
-            _gate_guard: gate_guard,
+            rw: page_caching::RW::new(file, gate_guard),
        })
    }
-}

-impl Drop for EphemeralFile {
-    fn drop(&mut self) {
-        // unlink the file
-        // we are clear to do this, because we have entered a gate
-        let path = &self.buffered_writer.as_inner().as_inner().path;
-        let res = std::fs::remove_file(path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
-            }
-        }
-    }
-}
-
-impl EphemeralFile {
    pub(crate) fn len(&self) -> u64 {
-        self.bytes_written
+        self.rw.bytes_written()
    }

    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
+        self.rw.page_cache_file_id()
    }

+    /// See [`self::page_caching::RW::load_to_vec`].
    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        let size = self.len().into_usize();
-        let vec = Vec::with_capacity(size);
-        let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
-        assert_eq!(nread, size);
-        let vec = slice.into_inner();
-        assert_eq!(vec.len(), nread);
-        assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
-        Ok(vec)
+        self.rw.load_to_vec(ctx).await
+    }
+
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, io::Error> {
+        self.rw.read_blk(blknum, ctx).await
+    }
+
+    #[cfg(test)]
+    // This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
+    pub(crate) async fn write_blob(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
+        let pos = self.rw.bytes_written();
+
+        let mut len_bytes = std::io::Cursor::new(Vec::new());
+        crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
+            srcbuf.len(),
+            &mut len_bytes,
+        );
+        let len_bytes = len_bytes.into_inner();
+
+        // Write the length field
+        self.rw.write_all_borrowed(&len_bytes, ctx).await?;
+
+        // Write the payload
+        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+
+        Ok(pos)
    }

    /// Returns the offset at which the first byte of the input was written, for use
    /// in constructing indices over the written value.
-    ///
-    /// Panics if the write is short because there's no way we can recover from that.
-    /// TODO: make upstack handle this as an error.
    pub(crate) async fn write_raw(
        &mut self,
        srcbuf: &[u8],
        ctx: &RequestContext,
-    ) -> std::io::Result<u64> {
-        let pos = self.bytes_written;
-
-        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
-            std::io::Error::new(
-                std::io::ErrorKind::Other,
-                format!(
-                    "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
-                    srcbuf_len = srcbuf.len(),
-                ),
-            )
-        })?;
+    ) -> Result<u64, io::Error> {
+        let pos = self.rw.bytes_written();

        // Write the payload
-        let nwritten = self
-            .buffered_writer
-            .write_buffered_borrowed(srcbuf, ctx)
-            .await?;
-        assert_eq!(
-            nwritten,
-            srcbuf.len(),
-            "buffered writer has no short writes"
-        );
-
-        self.bytes_written = new_bytes_written;
+        self.rw.write_all_borrowed(srcbuf, ctx).await?;

        Ok(pos)
    }
 }

-impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
-    async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
-        &'b self,
-        start: u64,
-        dst: tokio_epoll_uring::Slice<B>,
-        ctx: &'a RequestContext,
-    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
-        let file_size_tracking_writer = self.buffered_writer.as_inner();
-        let flushed_offset = file_size_tracking_writer.bytes_written();
-
-        let buffer = self.buffered_writer.inspect_buffer();
-        let buffered = &buffer[0..buffer.pending()];
-
-        let dst_cap = dst.bytes_total().into_u64();
-        let end = {
-            // saturating_add is correct here because the max file size is u64::MAX, so,
-            // if start + dst.len() > u64::MAX, then we know it will be a short read
-            let mut end: u64 = start.saturating_add(dst_cap);
-            if end > self.bytes_written {
-                end = self.bytes_written;
-            }
-            end
-        };
-
-        // inclusive, exclusive
-        #[derive(Debug)]
-        struct Range<N>(N, N);
-        impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
-            fn len(&self) -> N {
-                if self.0 > self.1 {
-                    N::zero()
-                } else {
-                    self.1 - self.0
-                }
-            }
-        }
-        let written_range = Range(start, std::cmp::min(end, flushed_offset));
-        let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
-
-        let dst = if written_range.len() > 0 {
-            let file: &VirtualFile = file_size_tracking_writer.as_inner();
-            let bounds = dst.bounds();
-            let slice = file
-                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
-                .await?;
-            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
-        } else {
-            dst
-        };
-
-        let dst = if buffered_range.len() > 0 {
-            let offset_in_buffer = buffered_range
-                .0
-                .checked_sub(flushed_offset)
-                .unwrap()
-                .into_usize();
-            let to_copy =
-                &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
-            let bounds = dst.bounds();
-            let mut view = dst.slice({
-                let start = written_range.len().into_usize();
-                let end = start
-                    .checked_add(buffered_range.len().into_usize())
-                    .unwrap();
-                start..end
-            });
-            view.as_mut_rust_slice_full_zeroed()
-                .copy_from_slice(to_copy);
-            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
-        } else {
-            dst
-        };
-
-        // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
-
-        Ok((dst, (end - start).into_usize()))
-    }
-}
-
 /// Does the given filename look like an ephemeral file?
 pub fn is_ephemeral_file(filename: &str) -> bool {
    if let Some(rest) = filename.strip_prefix("ephemeral-") {
@@ -245,13 +129,19 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

+impl BlockReader for EphemeralFile {
+    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
+        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use rand::Rng;
-
    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
+    use crate::tenant::block_io::BlockReaderRef;
+    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -282,6 +172,69 @@ mod tests {
        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }

+    #[tokio::test]
+    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
+        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
+
+        let gate = utils::sync::gate::Gate::default();
+
+        let entered = gate.enter().unwrap();
+
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
+
+        let pos_foo = file.write_blob(b"foo", &ctx).await?;
+        assert_eq!(
+            b"foo",
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
+        );
+        let pos_bar = file.write_blob(b"bar", &ctx).await?;
+        assert_eq!(
+            b"foo",
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
+        );
+        assert_eq!(
+            b"bar",
+            file.block_cursor()
+                .read_blob(pos_bar, &ctx)
+                .await?
+                .as_slice()
+        );
+
+        let mut blobs = Vec::new();
+        for i in 0..10000 {
+            let data = Vec::from(format!("blob{}", i).as_bytes());
+            let pos = file.write_blob(&data, &ctx).await?;
+            blobs.push((pos, data));
+        }
+        // also test with larger blobs
+        for i in 0..100 {
+            let data = format!("blob{}", i).as_bytes().repeat(100);
+            let pos = file.write_blob(&data, &ctx).await?;
+            blobs.push((pos, data));
+        }
+
+        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
+        for (pos, expected) in blobs {
+            let actual = cursor.read_blob(pos, &ctx).await?;
+            assert_eq!(actual, expected);
+        }
+
+        // Test a large blob that spans multiple pages
+        let mut large_data = vec![0; 20000];
+        thread_rng().fill_bytes(&mut large_data);
+        let pos_large = file.write_blob(&large_data, &ctx).await?;
+        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
+        assert_eq!(result, large_data);
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn ephemeral_file_holds_gate_open() {
        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
@@ -315,151 +268,4 @@ mod tests {
            .expect("closing completes right away")
            .expect("closing does not panic");
    }
-
-    #[tokio::test]
-    async fn test_ephemeral_file_basics() {
-        let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
-
-        let cap = file.buffered_writer.inspect_buffer().capacity();
-
-        let write_nbytes = cap + cap / 2;
-
-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
-            .take(write_nbytes)
-            .collect();
-
-        let mut value_offsets = Vec::new();
-        for i in 0..write_nbytes {
-            let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
-            value_offsets.push(off);
-        }
-
-        assert!(file.len() as usize == write_nbytes);
-        for i in 0..write_nbytes {
-            assert_eq!(value_offsets[i], i.into_u64());
-            let buf = Vec::with_capacity(1);
-            let (buf_slice, nread) = file
-                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
-                .await
-                .unwrap();
-            let buf = buf_slice.into_inner();
-            assert_eq!(nread, 1);
-            assert_eq!(&buf, &content[i..i + 1]);
-        }
-
-        let file_contents =
-            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
-        assert_eq!(file_contents, &content[0..cap]);
-
-        let buffer_contents = file.buffered_writer.inspect_buffer();
-        assert_eq!(buffer_contents, &content[cap..write_nbytes]);
-    }
-
-    #[tokio::test]
-    async fn test_flushes_do_happen() {
-        let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
-
-        let cap = file.buffered_writer.inspect_buffer().capacity();
-
-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
-            .collect();
-
-        file.write_raw(&content, &ctx).await.unwrap();
-
-        // assert the state is as this test expects it to be
-        assert_eq!(
-            &file.load_to_vec(&ctx).await.unwrap(),
-            &content[0..cap + cap / 2]
-        );
-        let md = file
-            .buffered_writer
-            .as_inner()
-            .as_inner()
-            .path
-            .metadata()
-            .unwrap();
-        assert_eq!(
-            md.len(),
-            cap.into_u64(),
-            "buffered writer does one write if we write 1.5x buffer capacity"
-        );
-        assert_eq!(
-            &file.buffered_writer.inspect_buffer()[0..cap / 2],
-            &content[cap..cap + cap / 2]
-        );
-    }
-
-    #[tokio::test]
-    async fn test_read_split_across_file_and_buffer() {
-        // This test exercises the logic on the read path that splits the logical read
-        // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
-        //
-        // This test build on the assertions in test_flushes_do_happen
-
-        let (conf, tenant_id, timeline_id, ctx) =
-            harness("test_read_split_across_file_and_buffer").unwrap();
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
-
-        let cap = file.buffered_writer.inspect_buffer().capacity();
-
-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
-            .collect();
-
-        file.write_raw(&content, &ctx).await.unwrap();
-
-        let test_read = |start: usize, len: usize| {
-            let file = &file;
-            let ctx = &ctx;
-            let content = &content;
-            async move {
-                let (buf, nread) = file
-                    .read_exact_at_eof_ok(
-                        start.into_u64(),
-                        Vec::with_capacity(len).slice_full(),
-                        ctx,
-                    )
-                    .await
-                    .unwrap();
-                assert_eq!(nread, len);
-                assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
-            }
-        };
-
-        // completely within the file range
-        assert!(20 < cap, "test assumption");
-        test_read(10, 10).await;
-        // border onto edge of file
-        test_read(cap - 10, 10).await;
-        // read across file and buffer
-        test_read(cap - 10, 20).await;
-        // stay from start of buffer
-        test_read(cap, 10).await;
-        // completely within buffer
-        test_read(cap + 10, 10).await;
-    }
 }
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -0,0 +1,153 @@
+//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
+//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
+//!
+//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
+
+use crate::context::RequestContext;
+use crate::page_cache::{self, PAGE_SZ};
+use crate::tenant::block_io::BlockLease;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
+use crate::virtual_file::VirtualFile;
+
+use std::io::{self};
+use tokio_epoll_uring::BoundedBuf;
+use tracing::*;
+
+use super::zero_padded_read_write;
+
+/// See module-level comment.
+pub struct RW {
+    page_cache_file_id: page_cache::FileId,
+    rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
+    /// Gate guard is held for as long as we need to do operations on the path (delete on drop).
+    _gate_guard: utils::sync::gate::GateGuard,
+}
+
+impl RW {
+    pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
+        let page_cache_file_id = page_cache::next_file_id();
+        Self {
+            page_cache_file_id,
+            rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
+            _gate_guard,
+        }
+    }
+
+    pub fn page_cache_file_id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
+    }
+
+    pub(crate) async fn write_all_borrowed(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<usize, io::Error> {
+        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
+        // because Compute is unlikely to access recently written data.
+        self.rw.write_all_borrowed(srcbuf, ctx).await
+    }
+
+    pub(crate) fn bytes_written(&self) -> u64 {
+        self.rw.bytes_written()
+    }
+
+    /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
+    ///
+    /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
+    /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
+    pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
+        // round up to the next PAGE_SZ multiple, required by blob_io
+        let size = {
+            let s = usize::try_from(self.bytes_written()).unwrap();
+            if s % PAGE_SZ == 0 {
+                s
+            } else {
+                s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
+            }
+        };
+        let vec = Vec::with_capacity(size);
+
+        // read from disk what we've already flushed
+        let file_size_tracking_writer = self.rw.as_writer();
+        let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
+        let mut vec = file_size_tracking_writer
+            .as_inner()
+            .read_exact_at(
+                vec.slice(0..(flushed_range.end - flushed_range.start)),
+                u64::try_from(flushed_range.start).unwrap(),
+                ctx,
+            )
+            .await?
+            .into_inner();
+
+        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
+        let buffered = self.rw.get_tail_zero_padded();
+        vec.extend_from_slice(buffered);
+        assert_eq!(vec.len(), size);
+        assert_eq!(vec.len() % PAGE_SZ, 0);
+        Ok(vec)
+    }
+
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, io::Error> {
+        match self.rw.read_blk(blknum).await? {
+            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
+                let cache = page_cache::get();
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.rw.as_writer().as_inner().path,
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(write_guard) => {
+                        let write_guard = writer
+                            .as_inner()
+                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
+                            .await?;
+                        let read_guard = write_guard.mark_valid();
+                        return Ok(BlockLease::PageReadGuard(read_guard));
+                    }
+                }
+            }
+            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
+                Ok(BlockLease::EphemeralFileMutableTail(buffer))
+            }
+        }
+    }
+}
+
+impl Drop for RW {
+    fn drop(&mut self) {
+        // There might still be pages in the [`crate::page_cache`] for this file.
+        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+
+        // unlink the file
+        // we are clear to do this, because we have entered a gate
+        let path = &self.rw.as_writer().as_inner().path;
+        let res = std::fs::remove_file(path);
+        if let Err(e) = res {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!("could not remove ephemeral file '{path}': {e}");
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -0,0 +1,145 @@
+//! The heart of how [`super::EphemeralFile`] does its reads and writes.
+//!
+//! # Writes
+//!
+//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
+//! The [`RW`] batches these into bigger, [`TAIL_SZ`]-sized writes, using [`owned_buffers_io::write::BufferedWriter`].
+//!
+//! # Reads
+//!
+//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
+//!
+//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
+//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
+//! if the read is for the prefix that has already been flushed.
+//!
+//! # Current Usage
+//!
+//! The current user of this module is [`super::page_caching::RW`].
+
+mod zero_padded;
+
+use crate::{
+    context::RequestContext,
+    page_cache::PAGE_SZ,
+    virtual_file::owned_buffers_io::{
+        self,
+        write::{Buffer, OwnedAsyncWriter},
+    },
+};
+
+const TAIL_SZ: usize = 64 * 1024;
+
+/// See module-level comment.
+pub struct RW<W: OwnedAsyncWriter> {
+    buffered_writer: owned_buffers_io::write::BufferedWriter<
+        zero_padded::Buffer<TAIL_SZ>,
+        owned_buffers_io::util::size_tracking_writer::Writer<W>,
+    >,
+}
+
+pub enum ReadResult<'a, W> {
+    NeedsReadFromWriter { writer: &'a W },
+    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
+}
+
+impl<W> RW<W>
+where
+    W: OwnedAsyncWriter,
+{
+    pub fn new(writer: W) -> Self {
+        let bytes_flushed_tracker =
+            owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
+        let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
+            bytes_flushed_tracker,
+            zero_padded::Buffer::default(),
+        );
+        Self { buffered_writer }
+    }
+
+    pub(crate) fn as_writer(&self) -> &W {
+        self.buffered_writer.as_inner().as_inner()
+    }
+
+    pub async fn write_all_borrowed(
+        &mut self,
+        buf: &[u8],
+        ctx: &RequestContext,
+    ) -> std::io::Result<usize> {
+        self.buffered_writer.write_buffered_borrowed(buf, ctx).await
+    }
+
+    pub fn bytes_written(&self) -> u64 {
+        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
+        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
+        flushed_offset + u64::try_from(buffer.pending()).unwrap()
+    }
+
+    /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
+    pub fn get_tail_zero_padded(&self) -> &[u8] {
+        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
+        let buffer_written_up_to = buffer.pending();
+        // pad to next page boundary
+        let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
+            buffer_written_up_to
+        } else {
+            buffer_written_up_to
+                .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
+                .unwrap()
+        };
+        &buffer.as_zero_padded_slice()[0..read_up_to]
+    }
+
+    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
+        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
+        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
+        let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
+        let read_offset = (blknum as u64) * (PAGE_SZ as u64);
+
+        // The trailing page ("block") might only be partially filled,
+        // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
+        // Moreover, it has to be zero-padded, because when we still had
+        // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
+        // DeltaLayer probably has the same issue, not sure why it needs no special treatment.
+        // => check here that the read doesn't go beyond this potentially trailing page
+        // => the zero-padding is done in the `else` branch below
+        let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
+            buffered_offset / (PAGE_SZ as u64)
+        } else {
+            (buffered_offset / (PAGE_SZ as u64)) + 1
+        };
+        if (blknum as u64) >= blocks_written {
+            return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset:x}")));
+        }
+
+        // assertions for the `if-else` below
+        assert_eq!(
+            flushed_offset % (TAIL_SZ as u64), 0,
+            "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
+        );
+        assert_eq!(
+            flushed_offset % (PAGE_SZ as u64),
+            0,
+            "the logic below can't handle if the page is spread across the flushed part and the buffer"
+        );
+
+        if read_offset < flushed_offset {
+            assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
+            Ok(ReadResult::NeedsReadFromWriter {
+                writer: self.as_writer(),
+            })
+        } else {
+            let read_offset_in_buffer = read_offset
+                .checked_sub(flushed_offset)
+                .expect("would have taken `if` branch instead of this one");
+            let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
+            let zero_padded_slice = buffer.as_zero_padded_slice();
+            let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
+            Ok(ReadResult::ServedFromZeroPaddedMutableTail {
+                buffer: page
+                    .try_into()
+                    .expect("the slice above got it as page-size slice"),
+            })
+        }
+    }
+}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
@@ -0,0 +1,110 @@
+//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
+//! unwritten range is guaranteed to be zero-initialized.
+//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
+//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
+
+use std::mem::MaybeUninit;
+
+use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
+
+/// See module-level comment.
+pub struct Buffer<const N: usize> {
+    allocation: Box<[u8; N]>,
+    written: usize,
+}
+
+impl<const N: usize> Default for Buffer<N> {
+    fn default() -> Self {
+        Self {
+            allocation: Box::new(
+                // SAFETY: zeroed memory is a valid [u8; N]
+                unsafe { MaybeUninit::zeroed().assume_init() },
+            ),
+            written: 0,
+        }
+    }
+}
+
+impl<const N: usize> Buffer<N> {
+    #[inline(always)]
+    fn invariants(&self) {
+        // don't check by default, unoptimized is too expensive even for debug mode
+        if false {
+            debug_assert!(self.written <= N, "{}", self.written);
+            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
+        }
+    }
+
+    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
+        &self.allocation
+    }
+}
+
+impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
+    type IoBuf = Self;
+
+    fn cap(&self) -> usize {
+        self.allocation.len()
+    }
+
+    fn extend_from_slice(&mut self, other: &[u8]) {
+        self.invariants();
+        let remaining = self.allocation.len() - self.written;
+        if other.len() > remaining {
+            panic!("calling extend_from_slice() with insufficient remaining capacity");
+        }
+        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
+        self.written += other.len();
+        self.invariants();
+    }
+
+    fn pending(&self) -> usize {
+        self.written
+    }
+
+    fn flush(self) -> FullSlice<Self> {
+        self.invariants();
+        let written = self.written;
+        FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
+    }
+
+    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
+        let Self {
+            mut allocation,
+            written,
+        } = iobuf;
+        allocation[0..written].fill(0);
+        let new = Self {
+            allocation,
+            written: 0,
+        };
+        new.invariants();
+        new
+    }
+}
+
+/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
+/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
+///
+/// Remember that bytes_init is generally _not_ a tracker of the amount
+/// of valid data in the io buffer; we use `Slice` for that.
+/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
+///
+/// SAFETY:
+///
+/// The [`Self::allocation`] is stable because boxes are stable.
+/// The memory is zero-initialized, so, bytes_init is always N.
+unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
+    fn stable_ptr(&self) -> *const u8 {
+        self.allocation.as_ptr()
+    }
+
+    fn bytes_init(&self) -> usize {
+        // Yes, N, not self.written; Read the full comment of this impl block!
+        N
+    }
+
+    fn bytes_total(&self) -> usize {
+        N
+    }
+}
--- a/Show More
+++ b/Show More