Compare commits

..

8 Commits

Author | SHA1 | Message | Date
Conrad Ludgate | 2cca1b3e4e | fix | 2024-08-21 18:44:57 +01:00
Conrad Ludgate | 471b3b300d | fix pin | 2024-08-21 16:29:52 +01:00
Conrad Ludgate | fbd4b91169 | asyncreadready | 2024-08-21 16:16:49 +01:00
Conrad Ludgate | 8cc45ad9bd | asrawfd things | 2024-08-21 15:28:25 +01:00
Conrad Ludgate | aabbd55187 | add ktls handling | 2024-08-21 14:42:41 +01:00
Conrad Ludgate | 987a859352 | start integrating ktls | 2024-08-21 14:11:58 +01:00
Conrad Ludgate | e171fd805b | add ktls dep | 2024-08-21 13:51:02 +01:00
Conrad Ludgate | 1e4702b26a | update rustls | 2024-08-21 13:47:19 +01:00
246 changed files with 4292 additions and 9069 deletions

View File

@@ -23,30 +23,10 @@ platforms = [
]
[final-excludes]
workspace-members = [
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
"vm_monitor",
# All of these exist in libs and are not usually built independently.
# Putting workspace hack there adds a bottleneck for cargo builds.
"compute_api",
"consumption_metrics",
"desim",
"metrics",
"pageserver_api",
"postgres_backend",
"postgres_connection",
"postgres_ffi",
"pq_proto",
"remote_storage",
"safekeeper_api",
"tenant_size_model",
"tracing-utils",
"utils",
"wal_craft",
"walproposer",
]
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
# it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
# from depending on workspace-hack because most of the dependencies are not used.
workspace-members = ["vm_monitor"]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true

View File

@@ -1,6 +0,0 @@
blank_issues_enabled: true
contact_links:
- name: Feature request
url: https://console.neon.tech/app/projects?modal=feedback
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`

View File

@@ -71,7 +71,7 @@ runs:
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
prefix: latest
# The lack of compatibility snapshot (for example, for the new Postgres version)
@@ -169,8 +169,10 @@ runs:
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
else
cov_prefix=()
fi
@@ -211,13 +213,13 @@ runs:
fi
- name: Upload compatibility snapshot
# Note that we use `github.base_ref`, which is the target branch for a PR
if: github.event_name == 'pull_request' && github.base_ref == 'release'
if: github.ref_name == 'release'
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
prefix: latest
- name: Upload test results
if: ${{ !cancelled() }}

View File

@@ -94,16 +94,11 @@ jobs:
# We run tests with additional features that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
env:
ARCH: ${{ inputs.arch }}
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
@@ -163,8 +158,6 @@ jobs:
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
env:
ARCH: ${{ inputs.arch }}
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
@@ -179,7 +172,7 @@ jobs:
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
@@ -216,14 +209,8 @@ jobs:
#nextest does not yet support running doctests
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
# run all non-pageserver tests
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
# run pageserver tests with different settings
for io_engine in std-fs tokio-epoll-uring ; do
for io_buffer_alignment in 0 1 512 ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
# Run separate tests for real S3
@@ -256,8 +243,8 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
# Don't run regression tests on debug arm64 builds
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
# Run test on x64 only
if: inputs.arch == 'x64'
needs: [ build-neon ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:

View File

@@ -198,7 +198,7 @@ jobs:
strategy:
fail-fast: false
matrix:
arch: [ x64, arm64 ]
arch: [ x64 ]
# Do not build or run tests in debug for release branches
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
include:
@@ -1055,88 +1055,43 @@ jobs:
generate_release_notes: true,
})
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
promote-compatibility-data:
needs: [ deploy ]
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
if: github.ref_name == 'release'
runs-on: ubuntu-22.04
runs-on: [ self-hosted, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
id: fetch-last-release-pr-info
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
branch_name_and_pr_number=$(gh pr list \
--repo "${GITHUB_REPOSITORY}" \
--base release \
--state merged \
--limit 10 \
--json mergeCommit,headRefName,number \
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
run_id=$(gh run list \
--repo "${GITHUB_REPOSITORY}" \
--workflow build_and_test.yml \
--branch "${branch_name}" \
--json databaseId \
--limit 1 \
--jq '.[].databaseId')
last_commit_sha=$(gh pr view "${pr_number}" \
--repo "${GITHUB_REPOSITORY}" \
--json commits \
--jq '.commits[-1].oid')
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
- name: Promote compatibility snapshot and Neon artifact
- name: Promote compatibility snapshot for the release
env:
BUCKET: neon-github-public-dev
AWS_REGION: eu-central-1
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
PREFIX: artifacts/latest
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
new_prefix="artifacts/latest"
files_to_promote=()
files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
for arch in X64 ARM64; do
# Update compatibility snapshot for the release
for pg_version in v14 v15 v16; do
for build_type in debug release; do
neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
if [ -z "${s3_key}" ]; then
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
exit 1
fi
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
files_to_promote+=("s3://${BUCKET}/${s3_key}")
for pg_version in v14 v15 v16; do
# We run less tests for debug builds, so we don't need to promote them
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
continue
fi
compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
if [ -z "${s3_key}" ]; then
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
exit 1
fi
files_to_promote+=("s3://${BUCKET}/${s3_key}")
done
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
done
for f in "${files_to_promote[@]}"; do
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done
pin-build-tools-image:

Cargo.lock generated
View File

@@ -316,6 +316,33 @@ dependencies = [
"zeroize",
]
[[package]]
name = "aws-lc-rs"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae74d9bd0a7530e8afd1770739ad34b36838829d6ad61818f9230f683f5ad77"
dependencies = [
"aws-lc-sys",
"mirai-annotations",
"paste",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f0e249228c6ad2d240c2dc94b714d711629d52bad946075d8e9b2f5391f0703"
dependencies = [
"bindgen 0.69.4",
"cc",
"cmake",
"dunce",
"fs_extra",
"libc",
"paste",
]
[[package]]
name = "aws-runtime"
version = "1.2.1"
@@ -926,7 +953,7 @@ dependencies = [
"lazycell",
"log",
"peeking_take_while",
"prettyplease 0.2.6",
"prettyplease 0.2.17",
"proc-macro2",
"quote",
"regex",
@@ -937,10 +964,27 @@ dependencies = [
]
[[package]]
name = "bit_field"
version = "0.10.2"
name = "bindgen"
version = "0.69.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0"
dependencies = [
"bitflags 2.4.1",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"lazy_static",
"lazycell",
"log",
"prettyplease 0.2.17",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
"syn 2.0.52",
"which",
]
[[package]]
name = "bitflags"
@@ -1062,6 +1106,12 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "cgroups-rs"
version = "0.3.3"
@@ -1170,6 +1220,15 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "cmake"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
dependencies = [
"cc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -1214,6 +1273,7 @@ dependencies = [
"serde_json",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1326,6 +1386,7 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -1333,6 +1394,7 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"camino",
"clap",
"comfy-table",
@@ -1495,7 +1557,7 @@ dependencies = [
"bitflags 1.3.2",
"crossterm_winapi",
"libc",
"mio",
"mio 0.8.11",
"parking_lot 0.12.1",
"signal-hook",
"signal-hook-mio",
@@ -1673,13 +1735,14 @@ dependencies = [
"smallvec",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
name = "diesel"
version = "2.2.3"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
@@ -1770,6 +1833,12 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "dunce"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
[[package]]
name = "dyn-clone"
version = "1.0.14"
@@ -2071,6 +2140,12 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -2404,9 +2479,9 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.3"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hex"
@@ -2924,6 +2999,33 @@ dependencies = [
"libc",
]
[[package]]
name = "ktls"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebe51e4a53d53b396707537bc8a5277798b720fb71f0d1b9c63eb53199a00fde"
dependencies = [
"futures-util",
"ktls-sys",
"libc",
"memoffset 0.9.1",
"nix 0.29.0",
"num_enum",
"pin-project-lite",
"rustls 0.23.12",
"smallvec",
"thiserror",
"tokio",
"tokio-rustls 0.26.0",
"tracing",
]
[[package]]
name = "ktls-sys"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "095b1fc8d841c3df8c3f2db78b7425cb2ec424568a282cb589a880b99d256e84"
[[package]]
name = "lasso"
version = "0.7.2"
@@ -2950,10 +3052,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.150"
name = "leaky-bucket"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
dependencies = [
"parking_lot 0.12.1",
"tokio",
"tracing",
]
[[package]]
name = "libc"
version = "0.2.158"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
[[package]]
name = "libloading"
@@ -3117,9 +3230,9 @@ dependencies = [
[[package]]
name = "memoffset"
version = "0.9.0"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
dependencies = [
"autocfg",
]
@@ -3138,6 +3251,7 @@ dependencies = [
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
[[package]]
@@ -3194,6 +3308,24 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "mio"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
dependencies = [
"hermit-abi",
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys 0.52.0",
]
[[package]]
name = "mirai-annotations"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
[[package]]
name = "multimap"
version = "0.8.3"
@@ -3234,7 +3366,20 @@ dependencies = [
"bitflags 2.4.1",
"cfg-if",
"libc",
"memoffset 0.9.0",
"memoffset 0.9.1",
]
[[package]]
name = "nix"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
dependencies = [
"bitflags 2.4.1",
"cfg-if",
"cfg_aliases",
"libc",
"memoffset 0.9.1",
]
[[package]]
@@ -3261,7 +3406,7 @@ dependencies = [
"kqueue",
"libc",
"log",
"mio",
"mio 0.8.11",
"walkdir",
"windows-sys 0.48.0",
]
@@ -3383,6 +3528,27 @@ dependencies = [
"libc",
]
[[package]]
name = "num_enum"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179"
dependencies = [
"num_enum_derive",
]
[[package]]
name = "num_enum_derive"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "oauth2"
version = "4.4.2"
@@ -3677,7 +3843,6 @@ dependencies = [
"async-compression",
"async-stream",
"async-trait",
"bit_field",
"byteorder",
"bytes",
"camino",
@@ -3702,6 +3867,7 @@ dependencies = [
"humantime-serde",
"hyper 0.14.26",
"itertools 0.10.5",
"leaky-bucket",
"md5",
"metrics",
"nix 0.27.1",
@@ -3726,7 +3892,6 @@ dependencies = [
"reqwest 0.12.4",
"rpds",
"scopeguard",
"send-future",
"serde",
"serde_json",
"serde_path_to_error",
@@ -3782,6 +3947,7 @@ dependencies = [
"strum_macros",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -3789,6 +3955,7 @@ name = "pageserver_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
@@ -4045,9 +4212,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.13"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
[[package]]
name = "pin-utils"
@@ -4172,16 +4339,17 @@ dependencies = [
"futures",
"once_cell",
"pq_proto",
"rustls 0.22.4",
"rustls 0.23.12",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-rustls 0.26.0",
"tokio-util",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4194,6 +4362,7 @@ dependencies = [
"postgres",
"tokio-postgres",
"url",
"workspace_hack",
]
[[package]]
@@ -4201,7 +4370,7 @@ name = "postgres_ffi"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"bindgen 0.65.1",
"byteorder",
"bytes",
"crc32c",
@@ -4216,6 +4385,7 @@ dependencies = [
"serde",
"thiserror",
"utils",
"workspace_hack",
]
[[package]]
@@ -4253,6 +4423,7 @@ dependencies = [
"thiserror",
"tokio",
"tracing",
"workspace_hack",
]
[[package]]
@@ -4267,9 +4438,9 @@ dependencies = [
[[package]]
name = "prettyplease"
version = "0.2.6"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
dependencies = [
"proc-macro2",
"syn 2.0.52",
@@ -4284,6 +4455,15 @@ dependencies = [
"elliptic-curve 0.13.8",
]
[[package]]
name = "proc-macro-crate"
version = "3.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284"
dependencies = [
"toml_edit 0.21.1",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
@@ -4442,6 +4622,7 @@ dependencies = [
"itertools 0.10.5",
"jose-jwa",
"jose-jwk",
"ktls",
"lasso",
"md5",
"measured",
@@ -4472,7 +4653,7 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash",
"rustls 0.22.4",
"rustls 0.23.12",
"rustls-native-certs 0.7.0",
"rustls-pemfile 2.1.1",
"scopeguard",
@@ -4491,7 +4672,7 @@ dependencies = [
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-rustls 0.26.0",
"tokio-tungstenite",
"tokio-util",
"tower-service",
@@ -4657,12 +4838,13 @@ dependencies = [
[[package]]
name = "rcgen"
version = "0.12.1"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1"
checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779"
dependencies = [
"pem",
"ring 0.17.6",
"rustls-pki-types",
"time",
"yasna",
]
@@ -4817,6 +4999,7 @@ dependencies = [
"toml_edit 0.19.10",
"tracing",
"utils",
"workspace_hack",
]
[[package]]
@@ -5174,7 +5357,22 @@ dependencies = [
"log",
"ring 0.17.6",
"rustls-pki-types",
"rustls-webpki 0.102.2",
"rustls-webpki 0.102.6",
"subtle",
"zeroize",
]
[[package]]
name = "rustls"
version = "0.23.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044"
dependencies = [
"aws-lc-rs",
"log",
"once_cell",
"rustls-pki-types",
"rustls-webpki 0.102.6",
"subtle",
"zeroize",
]
@@ -5225,9 +5423,9 @@ dependencies = [
[[package]]
name = "rustls-pki-types"
version = "1.3.1"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8"
checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0"
[[package]]
name = "rustls-webpki"
@@ -5251,10 +5449,11 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.102.2"
version = "0.102.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e"
dependencies = [
"aws-lc-rs",
"ring 0.17.6",
"rustls-pki-types",
"untrusted 0.9.0",
@@ -5341,6 +5540,7 @@ dependencies = [
"serde",
"serde_with",
"utils",
"workspace_hack",
]
[[package]]
@@ -5449,12 +5649,6 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
[[package]]
name = "send-future"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
[[package]]
name = "sentry"
version = "0.32.3"
@@ -5590,12 +5784,11 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.125"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@@ -5701,9 +5894,9 @@ dependencies = [
[[package]]
name = "sha2-asm"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e"
checksum = "b845214d6175804686b2bd482bcffe96651bb2d1200742b712003504a2dac1ab"
dependencies = [
"cc",
]
@@ -5740,7 +5933,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
dependencies = [
"libc",
"mio",
"mio 0.8.11",
"signal-hook",
]
@@ -5802,9 +5995,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.13.1"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "smol_str"
@@ -5950,6 +6143,7 @@ name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
@@ -5995,7 +6189,7 @@ dependencies = [
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls 0.23.12",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
@@ -6005,7 +6199,7 @@ dependencies = [
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-util",
"tracing",
@@ -6182,6 +6376,7 @@ dependencies = [
"anyhow",
"serde",
"serde_json",
"workspace_hack",
]
[[package]]
@@ -6216,18 +6411,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.57"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.57"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
dependencies = [
"proc-macro2",
"quote",
@@ -6354,20 +6549,19 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.37.0"
version = "1.39.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5"
dependencies = [
"backtrace",
"bytes",
"libc",
"mio",
"num_cpus",
"mio 1.0.2",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.5.5",
"tokio-macros",
"windows-sys 0.48.0",
"windows-sys 0.52.0",
]
[[package]]
@@ -6398,9 +6592,9 @@ dependencies = [
[[package]]
name = "tokio-macros"
version = "2.2.0"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752"
dependencies = [
"proc-macro2",
"quote",
@@ -6432,16 +6626,15 @@ dependencies = [
[[package]]
name = "tokio-postgres-rustls"
version = "0.11.1"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
dependencies = [
"futures",
"ring 0.17.6",
"rustls 0.22.4",
"rustls 0.23.12",
"tokio",
"tokio-postgres",
"tokio-rustls 0.25.0",
"tokio-rustls 0.26.0",
"x509-certificate",
]
@@ -6466,6 +6659,17 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
dependencies = [
"rustls 0.23.12",
"rustls-pki-types",
"tokio",
]
[[package]]
name = "tokio-stream"
version = "0.1.14"
@@ -6567,6 +6771,17 @@ dependencies = [
"winnow 0.4.6",
]
[[package]]
name = "toml_edit"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1"
dependencies = [
"indexmap 2.0.1",
"toml_datetime",
"winnow 0.5.40",
]
[[package]]
name = "toml_edit"
version = "0.22.14"
@@ -6659,11 +6874,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
[[package]]
name = "tracing"
version = "0.1.37"
version = "0.1.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8"
checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
dependencies = [
"cfg-if",
"log",
"pin-project-lite",
"tracing-attributes",
@@ -6683,9 +6897,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
version = "0.1.24"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
dependencies = [
"proc-macro2",
"quote",
@@ -6694,9 +6908,9 @@ dependencies = [
[[package]]
name = "tracing-core"
version = "0.1.31"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
dependencies = [
"once_cell",
"valuable",
@@ -6782,6 +6996,7 @@ dependencies = [
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
@@ -6952,6 +7167,7 @@ dependencies = [
"anyhow",
"arc-swap",
"async-compression",
"async-trait",
"bincode",
"byteorder",
"bytes",
@@ -6967,6 +7183,7 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"jsonwebtoken",
"leaky-bucket",
"metrics",
"nix 0.27.1",
"once_cell",
@@ -6997,6 +7214,7 @@ dependencies = [
"url",
"uuid",
"walkdir",
"workspace_hack",
]
[[package]]
@@ -7075,6 +7293,7 @@ dependencies = [
"postgres_ffi",
"regex",
"utils",
"workspace_hack",
]
[[package]]
@@ -7092,9 +7311,10 @@ name = "walproposer"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"bindgen 0.65.1",
"postgres_ffi",
"utils",
"workspace_hack",
]
[[package]]
@@ -7545,6 +7765,15 @@ dependencies = [
"memchr",
]
[[package]]
name = "winnow"
version = "0.5.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876"
dependencies = [
"memchr",
]
[[package]]
name = "winnow"
version = "0.6.13"
@@ -7634,6 +7863,8 @@ dependencies = [
"reqwest 0.11.19",
"reqwest 0.12.4",
"rustls 0.21.11",
"rustls-pki-types",
"rustls-webpki 0.102.6",
"scopeguard",
"serde",
"serde_json",
@@ -7651,6 +7882,8 @@ dependencies = [
"tokio",
"tokio-rustls 0.24.0",
"tokio-util",
"toml_datetime",
"toml_edit 0.19.10",
"tonic",
"tower",
"tracing",

View File

@@ -65,7 +65,6 @@ axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.65"
bit_field = "0.10.2"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
@@ -108,12 +107,13 @@ ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.8"
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
@@ -139,13 +139,12 @@ reqwest-retry = "0.5"
routerify = "3"
rpds = "0.13"
rustc-hash = "1.1.0"
rustls = "0.22"
rustls = "0.23"
rustls-pemfile = "2"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
send-future = "0.1.0"
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
@@ -172,8 +171,8 @@ tikv-jemalloc-ctl = "0.5"
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.11.0"
tokio-rustls = "0.25"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = "0.26"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
@@ -233,7 +232,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
criterion = "0.5.1"
rcgen = "0.12"
rcgen = "0.13"
rstest = "0.18"
camino-tempfile = "1.0.2"
tonic-build = "0.9"

View File

@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hint_plan.patch /ext-src
COPY patches/pg_hintplan.patch /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
RUN patch -p1 < /ext-src/pg_hintplan.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch

View File

@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
#### Running neon database

View File

@@ -44,7 +44,6 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
use tracing::{error, info, warn};
@@ -367,8 +366,6 @@ fn wait_spec(
state.start_time = now;
}
launch_lsn_lease_bg_task_for_static(&compute);
Ok(WaitSpecResult {
compute,
http_port,

View File

@@ -11,7 +11,6 @@ pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
pub mod lsn_lease;
mod migration;
pub mod monitor;
pub mod params;

View File

@@ -1,186 +0,0 @@
use anyhow::bail;
use anyhow::Result;
use postgres::{NoTls, SimpleQueryMessage};
use std::time::SystemTime;
use std::{str::FromStr, sync::Arc, thread, time::Duration};
use utils::id::TenantId;
use utils::id::TimelineId;
use compute_api::spec::ComputeMode;
use tracing::{info, warn};
use utils::{
lsn::Lsn,
shard::{ShardCount, ShardNumber, TenantShardId},
};
use crate::compute::ComputeNode;
/// Spawns a background thread to periodically renew LSN leases for static compute.
/// Do nothing if the compute is not in static mode.
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
let (tenant_id, timeline_id, lsn) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("Spec must be set");
match spec.spec.mode {
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
_ => return,
}
};
let compute = compute.clone();
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
thread::spawn(move || {
let _entered = span.entered();
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
// TODO: might need stronger error feedback than logging a warning.
warn!("Exited with error: {e}");
}
});
}
/// Renews the LSN lease periodically so a static compute is not affected by GC.
fn lsn_lease_bg_task(
compute: Arc<ComputeNode>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<()> {
loop {
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
let valid_duration = valid_until
.duration_since(SystemTime::now())
.unwrap_or(Duration::ZERO);
// Sleep for 60 seconds less than the valid duration, but at least half of the valid duration.
let sleep_duration = valid_duration
.saturating_sub(Duration::from_secs(60))
.max(valid_duration / 2);
info!(
"Succeeded, sleeping for {} seconds",
sleep_duration.as_secs()
);
thread::sleep(sleep_duration);
}
}
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
fn acquire_lsn_lease_with_retry(
compute: &Arc<ComputeNode>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<SystemTime> {
let mut attempts = 0usize;
let mut retry_period_ms: f64 = 500.0;
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let configs = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
let conn_strings = spec.pageserver_connstr.split(',');
conn_strings
.map(|connstr| {
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token.clone());
} else {
info!("Storage auth token not set");
}
config
})
.collect::<Vec<_>>()
};
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
match result {
Ok(Some(res)) => {
return Ok(res);
}
Ok(None) => {
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
}
Err(e) => {
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
thread::sleep(Duration::from_millis(retry_period_ms as u64));
retry_period_ms *= 1.5;
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
}
}
attempts += 1;
}
}
/// Tries to acquire an LSN lease through PS page_service API.
fn try_acquire_lsn_lease(
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
configs: &[postgres::Config],
) -> Result<Option<SystemTime>> {
fn get_valid_until(
config: &postgres::Config,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let mut client = config.connect(NoTls)?;
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
let res = client.simple_query(&cmd)?;
let msg = match res.first() {
Some(msg) => msg,
None => bail!("empty response"),
};
let row = match msg {
SimpleQueryMessage::Row(row) => row,
_ => bail!("error parsing lsn lease response"),
};
// Note: this will be None if a lease is explicitly not granted.
let valid_until_str = row.get("valid_until");
let valid_until = valid_until_str.map(|s| {
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
.expect("Time larger than max SystemTime could handle")
});
Ok(valid_until)
}
let shard_count = configs.len();
let valid_until = if shard_count > 1 {
configs
.iter()
.enumerate()
.map(|(shard_number, config)| {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_count: ShardCount::new(shard_count as u8),
shard_number: ShardNumber(shard_number as u8),
};
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
})
.collect::<Result<Vec<Option<SystemTime>>>>()?
.into_iter()
.min()
.unwrap()
} else {
get_valid_until(
&configs[0],
TenantShardId::unsharded(tenant_id),
timeline_id,
lsn,
)?
};
Ok(valid_until)
}

View File

@@ -6,6 +6,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true

View File

@@ -5,7 +5,6 @@
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use std::future::Future;
use std::io::Write;
use std::path::PathBuf;
use std::time::Duration;
@@ -35,10 +34,12 @@ pub enum SafekeeperHttpError {
type Result<T> = result::Result<T, SafekeeperHttpError>;
pub(crate) trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
#[async_trait::async_trait]
pub trait ResponseErrorMessageExt: Sized {
async fn error_from_body(self) -> Result<Self>;
}
#[async_trait::async_trait]
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
let status = self.status();

View File

@@ -41,8 +41,6 @@ enum Command {
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
#[arg(long)]
availability_zone_id: String,
},
/// Modify a node's configuration in the storage controller
@@ -149,9 +147,9 @@ enum Command {
#[arg(long)]
threshold: humantime::Duration,
},
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
// Drain a set of specified pageservers by moving the primary attachments to pageservers
// outside of the specified set.
BulkMigrate {
Drain {
// Set of pageserver node ids to drain.
#[arg(long)]
nodes: Vec<NodeId>,
@@ -165,34 +163,6 @@ enum Command {
#[arg(long)]
dry_run: Option<bool>,
},
/// Start draining the specified pageserver.
/// The drain is complete when the scheduling policy returns to active.
StartDrain {
#[arg(long)]
node_id: NodeId,
},
/// Cancel draining the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelDrain {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
/// Start filling the specified pageserver.
/// The fill is complete when the scheduling policy returns to active.
StartFill {
#[arg(long)]
node_id: NodeId,
},
/// Cancel filling the specified pageserver and wait for `timeout`
/// for the operation to be canceled. May be retried.
CancelFill {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
timeout: humantime::Duration,
},
}
#[derive(Parser)]
@@ -279,34 +249,6 @@ impl FromStr for NodeAvailabilityArg {
}
}
async fn wait_for_scheduling_policy<F>(
client: Client,
node_id: NodeId,
timeout: Duration,
f: F,
) -> anyhow::Result<NodeSchedulingPolicy>
where
F: Fn(NodeSchedulingPolicy) -> bool,
{
let waiter = tokio::time::timeout(timeout, async move {
loop {
let node = client
.dispatch::<(), NodeDescribeResponse>(
Method::GET,
format!("control/v1/node/{node_id}"),
None,
)
.await?;
if f(node.scheduling) {
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
}
}
});
Ok(waiter.await??)
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
@@ -324,7 +266,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
availability_zone_id,
} => {
storcon_client
.dispatch::<_, ()>(
@@ -336,7 +277,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
availability_zone_id: Some(availability_zone_id),
}),
)
.await?;
@@ -688,7 +628,7 @@ async fn main() -> anyhow::Result<()> {
})
.await?;
}
Command::BulkMigrate {
Command::Drain {
nodes,
concurrency,
max_shards,
@@ -717,7 +657,7 @@ async fn main() -> anyhow::Result<()> {
}
if nodes.len() != node_to_drain_descs.len() {
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
anyhow::bail!("Drain requested for node which doesn't exist.")
}
node_to_fill_descs.retain(|desc| {
@@ -729,7 +669,7 @@ async fn main() -> anyhow::Result<()> {
});
if node_to_fill_descs.is_empty() {
anyhow::bail!("There are no nodes to migrate to")
anyhow::bail!("There are no nodes to drain to")
}
// Set the node scheduling policy to draining for the nodes which
@@ -750,7 +690,7 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
// Perform the migration: move each tenant shard scheduled on a node to
// Perform the drain: move each tenant shard scheduled on a node to
// be drained to a node which is being filled. A simple round robin
// strategy is used to pick the new node.
let tenants = storcon_client
@@ -763,13 +703,13 @@ async fn main() -> anyhow::Result<()> {
let mut selected_node_idx = 0;
struct MigrationMove {
struct DrainMove {
tenant_shard_id: TenantShardId,
from: NodeId,
to: NodeId,
}
let mut moves: Vec<MigrationMove> = Vec::new();
let mut moves: Vec<DrainMove> = Vec::new();
let shards = tenants
.into_iter()
@@ -799,7 +739,7 @@ async fn main() -> anyhow::Result<()> {
continue;
}
moves.push(MigrationMove {
moves.push(DrainMove {
tenant_shard_id: shard.tenant_shard_id,
from: shard
.node_attached
@@ -876,67 +816,6 @@ async fn main() -> anyhow::Result<()> {
failure
);
}
Command::StartDrain { node_id } => {
storcon_client
.dispatch::<(), ()>(
Method::PUT,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Drain started for {node_id}");
}
Command::CancelDrain { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/drain"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active | PauseForRestart)
})
.await?;
println!(
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
);
}
Command::StartFill { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
.await?;
println!("Fill started for {node_id}");
}
Command::CancelFill { node_id, timeout } => {
storcon_client
.dispatch::<(), ()>(
Method::DELETE,
format!("control/v1/node/{node_id}/fill"),
None,
)
.await?;
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
let final_policy =
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
use NodeSchedulingPolicy::*;
matches!(sched, Active)
})
.await?;
println!(
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
);
}
}
Ok(())

View File

@@ -3,7 +3,7 @@ set -x
cd /ext-src || exit 2
FAILED=
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d "${d}" ] || continue

View File

@@ -1,259 +0,0 @@
# Rolling Storage Controller Restarts
## Summary
This RFC describes the issues around the current storage controller restart procedure
and proposes an implementation which reduces downtime to a few milliseconds on the happy path.
## Motivation
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
While the storage controller does not sit on the main data path, it's generally not acceptable
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
### Current Implementation
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
In non-Kubernetes terms, during an upgrade the currently running storage controller is stopped and only afterwards
is a new instance created.
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
## Prior Art
There are probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
* Active/Standby architectures: Two or more instances of the same service run, but traffic is only routed to one of them.
For fail-over, traffic is routed to one of the standbys (which becomes active).
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
## Requirements
* Reduce storage controller unavailability during upgrades to milliseconds
* Minimize the interval in which it's possible for more than one storage controller
to issue reconciles.
* Have one uniform implementation for restarts and upgrades
* Fit in with the current Kubernetes deployment scheme
## Non Goals
* Implement our own consensus algorithm from scratch
* Completely eliminate storage controller downtime. Instead, we aim to reduce it to the point where it looks
like a transient error to the control plane
## Impacted Components
* storage controller
* deployment orchestration (i.e. Ansible)
* helm charts
## Terminology
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
at start-up by querying pageservers
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
a set of replicas
## Implementation
### High Level Flow
At a very high level the proposed idea is to start a new storage controller instance while
the previous one is still running and cut over to it when it becomes ready. The new instance
should coordinate with the existing one and transition responsibility gracefully. While the controller
has built-in safety against split-brain situations (via generation numbers), we'd like to avoid such
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
were operating at the same time and require operator intervention to remedy.
### Kubernetes Deployment Configuration
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.rollingUpdate.maxUnavailable=0`.
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
### Storage Controller Start-Up
This section describes the primitives required on the storage controller side and the flow of the happy path.
#### Database Table For Leader Synchronization
A new table should be added to the storage controller database for leader synchronization during startup.
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
contains two elements:
* `hostname`: represents the hostname for the current storage controller leader - should be addressable
from other pods in the deployment
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
our needs here.
```
START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
WHERE hostname=<old_hostname> AND start_timestamp=<old_start_ts>;
```
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
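As a rough illustration only (not the final implementation), the compare-and-exchange could look like this from the controller side, assuming the synchronous `postgres` crate, a text `hostname` column and a timestamp `start_timestamp` column; the function name and signature are hypothetical:

```rust
use std::time::SystemTime;

use postgres::{Client, IsolationLevel};

/// Hypothetical sketch: swap the single `leader` row from the values we read
/// at start-up to our own, failing if another instance got there first.
fn try_become_leader(
    client: &mut Client,
    old: (&str, SystemTime),
    new: (&str, SystemTime),
) -> Result<bool, postgres::Error> {
    // REPEATABLE READ (or stricter) prevents two concurrent updates from both
    // appearing to succeed, as discussed above.
    let mut txn = client
        .build_transaction()
        .isolation_level(IsolationLevel::RepeatableRead)
        .start()?;
    let updated = txn.execute(
        "UPDATE leader SET hostname = $1, start_timestamp = $2 \
         WHERE hostname = $3 AND start_timestamp = $4",
        &[&new.0, &new.1, &old.0, &old.1],
    )?;
    txn.commit()?;
    // Zero rows updated means the row changed under us: treat it as a failure.
    Ok(updated == 1)
}
```

Checking the affected-row count, rather than re-reading the row, is what gives the update its compare-and-exchange semantics.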
#### Step Down API
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
snapshot of the observed state.
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
for failure scenario handling - see [Handling Failures](#handling-failures)).
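For illustration, the call from the incoming instance's side might be sketched as below; the endpoint path is the one defined above, while the retry count, the backoff and the use of `serde_json::Value` as a stand-in for the observed-state snapshot type are assumptions:

```rust
use std::time::Duration;

/// Hypothetical sketch of the new instance asking the current leader to step
/// down. Assumes reqwest with the `json` feature and a tokio runtime.
async fn request_step_down(leader_hostname: &str) -> anyhow::Result<serde_json::Value> {
    let client = reqwest::Client::new();
    let url = format!("http://{leader_hostname}/control/v1/step_down");
    let mut last_err = None;
    // Retry a few times with a short backoff so that a dead previous leader
    // does not extend the unavailability window by much.
    for _ in 0..3 {
        match client.post(url.as_str()).send().await {
            Ok(resp) if resp.status().is_success() => {
                // The response body is the serialized observed-state snapshot.
                return Ok(resp.json::<serde_json::Value>().await?);
            }
            Ok(resp) => {
                last_err = Some(anyhow::anyhow!("step_down returned {}", resp.status()));
            }
            Err(e) => last_err = Some(anyhow::Error::from(e)),
        }
        tokio::time::sleep(Duration::from_millis(250)).await;
    }
    Err(last_err.expect("at least one attempt was made"))
}
```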
#### Graceful Restart Happy Path
At start-up, the first thing the storage controller does is retrieve the sole row from the new
`leader` table. If such an entry exists, send a `/step_down` POST API call to the current leader.
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
pageservers in order to build up the observed state.
Before doing any reconciliations or persistent changes, update the `leader` database table as described in the [Database Table For Leader Synchronization](#database-table-for-leader-synchronization)
section. If this step fails, the storage controller process exits.
Note that no row will exist in the `leader` table for the first graceful restart. In that case, force-update the `leader` table
(without the WHERE clause) and proceed with the pre-existing start-up procedure (i.e. build the observed state by querying pageservers).
Summary of proposed new start-up sequence:
1. Call `/step_down`
2. Perform any pending database migrations
3. Load state from database
4. Load observed state returned in step (1) into memory
5. Do initial heartbeat round (may be moved after 5)
6. Mark self as leader by updating the database
7. Reschedule and reconcile everything
Some things to note from the steps above:
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
calls to the pageserver and no compute notifications)
* Ask the current leader to step down before loading state from database so we don't get a lost update
if the transactions overlap.
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
fall back to asking the pageservers about their current locations.
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
### Handling Failures
#### Storage Controller Crash Or Restart
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
`/step_down` will fail since the previous leader is no longer reachable. In this case, perform the pre-existing
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
exits and consistency is maintained.
#### Previous Leader Crashes Before New Leader Readiness
When the previous leader (P1) crashes before the new leader (P2) passes the readiness check, Kubernetes will
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
(see [2]).
Now we have two cases to consider:
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
by Kubernetes depending on timings.
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the new aspiring leader.
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
should avoid this self-reference and fail the API call at the client if the persisted hostname matches
the current one.
#### Previous Leader Crashes After New Leader Readiness
The deployment's replica sets already satisfy its replica count requirements, and the
Kubernetes deployment rollout will just clean up the dead pod.
#### New Leader Crashes Before Passing Readiness Check
The deployment controller scales up the new replica set by creating a new pod. The entire procedure is repeated
with the new pod.
#### Network Partition Between New Pod and Previous Leader
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
Kubernetes will terminate P1, but there may be a brief period where both storage controllers can drive reconciliations.
### Dealing With Split Brain Scenarios
As we've seen in the previous section, we can end up with two storage controllers running at the same time. The split brain
duration is not bounded, since the Kubernetes controller might become partitioned from the pods (though this is unlikely). While these
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
The rest of this section sketches some safety measures. It is likely overkill to implement all of them, however.
#### Ensure Leadership Before Producing Side Effects
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
applied if they race with the database update, but the situation will eventually be detected. The storage controller process should terminate in these cases.
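A sketch of such a pre-flight check, reusing the simplified single-row `leader` schema assumed earlier (a real implementation would likely cache or batch this rather than query the database per side effect):

```rust
use tokio_postgres::Client;

/// Cheap leadership check, meant to run before issuing a location_config call
/// to a pageserver or a compute notification to the control plane.
async fn ensure_still_leader(
    db: &Client,
    my_hostname: &str,
    my_start_timestamp: &str,
) -> Result<(), tokio_postgres::Error> {
    let row = db
        .query_one("SELECT hostname, start_timestamp FROM leader", &[])
        .await?;
    let (hostname, started): (String, String) = (row.get(0), row.get(1));
    if hostname != my_hostname || started != my_start_timestamp {
        // Someone else is the leader now: stop driving changes and terminate.
        tracing::error!("lost leadership to {hostname}, terminating");
        std::process::exit(1);
    }
    Ok(())
}
```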
#### Leadership Lease
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
to be renewed periodically. Two new columns would be added to the leaders table:
1. `last_renewed` - timestamp indicating when the lease was last renewed
2. `lease_duration` - duration indicating the amount of time after which the lease expires
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
to expire before acquiring leadership if they have not successfully received a response to the `/step_down` request.
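A sketch of the renewal loop on the leader side, again under the assumed schema; the renewal period and the decision to exit on any failed update are illustrative choices:

```rust
use std::time::Duration;
use tokio_postgres::Client;

/// Periodically renew the lease; exit if we are no longer the recorded leader.
async fn lease_renewal_loop(db: &Client, my_hostname: &str, my_start_timestamp: &str) {
    loop {
        let renewed = db
            .execute(
                "UPDATE leader SET last_renewed = now() \
                 WHERE hostname = $1 AND start_timestamp = $2",
                &[&my_hostname, &my_start_timestamp],
            )
            .await;
        // Anything other than exactly one updated row means we have lost
        // leadership (or cannot prove we still hold it), so give up.
        if !matches!(renewed, Ok(1)) {
            std::process::exit(1);
        }
        tokio::time::sleep(Duration::from_secs(10)).await;
    }
}
```

An aspiring leader that never got a `/step_down` response would then wait until `last_renewed + lease_duration` has passed before claiming leadership.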
#### Notify Pageserver Of Storage Controller Term
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
anything which contains a stale term (i.e. smaller than the current one).
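A sketch of the pageserver-side check; keeping the term in a process-wide atomic is a simplification for illustration, and the function name is hypothetical:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Latest storage controller term observed by this pageserver.
static LATEST_CONTROLLER_TERM: AtomicU64 = AtomicU64::new(0);

/// Accept a request only if its term is at least as new as the latest one seen;
/// refuse requests carrying a stale (smaller) term.
fn accept_controller_request(request_term: u64) -> bool {
    let mut current = LATEST_CONTROLLER_TERM.load(Ordering::Acquire);
    loop {
        if request_term < current {
            return false; // request comes from a superseded leader
        }
        match LATEST_CONTROLLER_TERM.compare_exchange(
            current,
            request_term,
            Ordering::AcqRel,
            Ordering::Acquire,
        ) {
            Ok(_) => return true,
            Err(observed) => current = observed,
        }
    }
}
```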
### Observability
* The storage controller should expose a metric which describes its state (`Active | WarmingUp | SteppedDown`).
Per-region alerts should be added on this metric, triggering when:
  + no storage controller has been in the `Active` state for an extended period of time
  + more than one storage controller is in the `Active` state
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
We'd have to expose the storage controller's read-only database to Grafana (perhaps this is already done).
## Alternatives
### Kubernetes Leases
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
In our case, it would work something like this:
* `/step_down` deletes the lease or stops it from renewing
* lease acquisition becomes part of the start-up procedure
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but building
leader election on top of it is still not exactly trivial.
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
* We offload the responsibility to Kubernetes, which makes it harder to debug when things go wrong.
* More code surface than the simple "row in database" approach. Also, most of this code would live in
a dependency, not subject to our code review.
* Hard to test. Our testing infra does not run the storage controller in Kubernetes, and changing it to do
so is not simple and complicates the test set-up.
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
to something external.

View File

@@ -21,21 +21,30 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch-15 REL_15_STABLE_neon
git checkout -b my-branch REL_15_STABLE_neon
```
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
1. Tag the last commit on the stable branch you are updating.
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git merge REL_15_4
git rebase REL_15_4
```
In the commit message of the merge commit, mention if there were
any non-trivial conflicts or other issues.
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
@@ -48,7 +57,7 @@ Postgres in a negative way.
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch-15
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
@@ -65,7 +74,7 @@ branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
@@ -80,12 +89,14 @@ minor Postgres release.
1. Create a pull request, and wait for CI to go green.
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push origin my-branch-15:REL_15_STABLE_neon
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell

View File

@@ -14,3 +14,5 @@ regex.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }
workspace_hack.workspace = true

View File

@@ -6,8 +6,10 @@ license = "Apache-2.0"
[dependencies]
anyhow.workspace = true
chrono = { workspace = true, features = ["serde"] }
chrono.workspace = true
rand.workspace = true
serde.workspace = true
serde_with.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -14,3 +14,5 @@ parking_lot.workspace = true
hex.workspace = true
scopeguard.workspace = true
smallvec = { workspace = true, features = ["write"] }
workspace_hack.workspace = true

View File

@@ -12,6 +12,8 @@ chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true

View File

@@ -21,9 +21,11 @@ hex.workspace = true
humantime.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono = { workspace = true, features = ["serde"] }
chrono.workspace = true
itertools.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
bincode.workspace = true
rand.workspace = true

View File

@@ -8,7 +8,6 @@ use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use crate::models::PageserverUtilization;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
@@ -56,8 +55,6 @@ pub struct NodeRegisterRequest {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub availability_zone_id: Option<String>,
}
#[derive(Serialize, Deserialize)]
@@ -143,11 +140,23 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
#[derive(Serialize, Clone, Debug)]
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}
#[derive(Serialize, Clone, Copy, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
Active(PageserverUtilization),
Active(UtilizationScore),
// Node is warming up, but we expect it to become available soon. Covers
// the time span between the re-attach response being composed on the storage controller
// and the first successful heartbeat after the processing of the re-attach response
@@ -186,9 +195,7 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => {
NodeAvailability::Active(PageserverUtilization::full())
}
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}

View File

@@ -108,41 +108,14 @@ impl Key {
}
}
/// This function checks more extensively what keys we can take on the write path.
/// If a key beginning with 00 does not have a global/default tablespace OID, it
/// will be rejected on the write path.
#[allow(dead_code)]
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
if !self.is_i128_representable() {
return false;
}
if self.field1 == 0
&& !(self.field2 == GLOBALTABLESPACE_OID
|| self.field2 == DEFAULTTABLESPACE_OID
|| self.field2 == 0)
{
return false; // User defined tablespaces are not supported
}
true
}
/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
/// checks if the key is i128 representable. Note that some keys can be successfully
/// ingested into the pageserver, but will cause errors on generating basebackup.
pub fn is_valid_key_on_write_path(&self) -> bool {
self.is_i128_representable()
}
pub fn is_i128_representable(&self) -> bool {
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.is_i128_representable(), "invalid key: {self}");
assert!(
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
"invalid key: {self}",
);
(((self.field1 & 0x7F) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
@@ -263,15 +236,6 @@ impl Key {
field5: u8::MAX,
field6: u32::MAX,
};
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
pub const NON_L0_MAX: Key = Key {
field1: u8::MAX,
field2: u32::MAX,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX - 1,
};
pub fn from_hex(s: &str) -> Result<Self> {
if s.len() != 36 {

View File

@@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization;
use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V2
Self::V1
}
}
@@ -486,11 +486,12 @@ pub struct EvictionPolicyLayerAccessThreshold {
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ThrottleConfig {
pub task_kinds: Vec<String>, // TaskKind
pub initial: u32,
pub initial: usize,
#[serde(with = "humantime_serde")]
pub refill_interval: Duration,
pub refill_amount: NonZeroU32,
pub max: u32,
pub refill_amount: NonZeroUsize,
pub max: usize,
pub fair: bool,
}
impl ThrottleConfig {
@@ -500,8 +501,9 @@ impl ThrottleConfig {
// other values don't matter with emtpy `task_kinds`.
initial: 0,
refill_interval: Duration::from_millis(1),
refill_amount: NonZeroU32::new(1).unwrap(),
refill_amount: NonZeroUsize::new(1).unwrap(),
max: 1,
fair: true,
}
}
/// The requests per second allowed by the given config.
@@ -716,7 +718,6 @@ pub struct TimelineInfo {
pub pg_version: u32,
pub state: TimelineState,
pub is_archived: bool,
pub walreceiver_status: String,
@@ -1061,7 +1062,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
}
}
// A GetPage request contains two LSN values:
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
@@ -1074,7 +1075,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
@@ -1082,11 +1083,15 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
@@ -1225,17 +1230,36 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
// these two fields are the same for every request type
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
// The rest of the messages are the same between V1 and V2
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
@@ -1443,7 +1467,9 @@ mod tests {
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
pub max_shard_count: u32,
/// Cached result of [`Self::score`]
pub utilization_score: Option<u64>,
pub utilization_score: u64,
/// When was this snapshot captured, pageserver local time.
///
@@ -50,8 +50,6 @@ fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}
pub type RawScore = u64;
impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;
@@ -64,7 +62,7 @@ impl PageserverUtilization {
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> RawScore {
pub fn score(&self) -> u64 {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
@@ -76,30 +74,8 @@ impl PageserverUtilization {
std::cmp::max(disk_utilization_score, shard_utilization_score)
}
pub fn cached_score(&mut self) -> RawScore {
match self.utilization_score {
None => {
let s = self.score();
self.utilization_score = Some(s);
s
}
Some(s) => s,
}
}
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
pub fn is_overloaded(score: RawScore) -> bool {
score >= Self::UTILIZATION_FULL
}
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
if self.shard_count < shard_count {
self.shard_count = shard_count;
// Dirty cache: this will be calculated next time someone retrives the score
self.utilization_score = None;
}
pub fn refresh_score(&mut self) {
self.utilization_score = self.score();
}
/// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -112,38 +88,7 @@ impl PageserverUtilization {
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Some(Self::UTILIZATION_FULL),
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
}
/// Test helper
pub mod test_utilization {
use super::PageserverUtilization;
use std::time::SystemTime;
use utils::{
serde_percent::Percent,
serde_system_time::{self},
};
// Parameters of the imaginary node used for test utilization instances
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
const TEST_SHARDS_MAX: u32 = 1000;
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
/// not abuse this function from non-test code.
///
/// Emulates a node with a 1000 shard limit and a 1TB disk.
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
PageserverUtilization {
disk_usage_bytes: disk_wanted_bytes,
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
disk_wanted_bytes,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count,
max_shard_count: TEST_SHARDS_MAX,
utilization_score: None,
utilization_score: Self::UTILIZATION_FULL,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
@@ -175,7 +120,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
disk_wanted_bytes: u64::MAX,
utilization_score: Some(13),
utilization_score: 13,
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,

View File

@@ -18,6 +18,7 @@ tokio-rustls.workspace = true
tracing.workspace = true
pq_proto.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -11,5 +11,7 @@ postgres.workspace = true
tokio-postgres.workspace = true
url.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
once_cell.workspace = true

View File

@@ -19,6 +19,8 @@ thiserror.workspace = true
serde.workspace = true
utils.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true

View File

@@ -136,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
// Export some version independent functions that are used outside of this mod
pub use v14::xlog_utils::encode_logical_message;
pub use v14::xlog_utils::from_pg_timestamp;
pub use v14::xlog_utils::get_current_timestamp;
pub use v14::xlog_utils::to_pg_timestamp;
pub use v14::xlog_utils::try_from_pg_timestamp;
pub use v14::xlog_utils::XLogFileName;
pub use v14::bindings::DBState_DB_SHUTDOWNED;

View File

@@ -135,8 +135,6 @@ pub fn get_current_timestamp() -> TimestampTz {
mod timestamp_conversions {
use std::time::Duration;
use anyhow::Context;
use super::*;
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
@@ -156,18 +154,18 @@ mod timestamp_conversions {
}
}
pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
let time: u64 = time
.try_into()
.context("timestamp before millenium (postgres epoch)")?;
.expect("timestamp before millenium (postgres epoch)");
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_micros(since_unix_epoch))
.context("SystemTime overflow")
.expect("SystemTime overflow")
}
}
pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
// start_lsn must point to some previously known record boundary (beginning of
@@ -547,14 +545,14 @@ mod tests {
#[test]
fn test_ts_conversion() {
let now = SystemTime::now();
let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
let now_pg = get_current_timestamp();
let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
assert_eq!(now_pg, round_trip_pg);
}

View File

@@ -14,6 +14,8 @@ postgres.workspace = true
postgres_ffi.workspace = true
camino-tempfile.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -11,7 +11,9 @@ itertools.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tokio.workspace = true
tracing.workspace = true
thiserror.workspace = true
serde.workspace = true
workspace_hack.workspace = true

View File

@@ -32,7 +32,7 @@ scopeguard.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true
workspace_hack.workspace = true
azure_core.workspace = true
azure_identity.workspace = true
azure_storage.workspace = true
@@ -46,4 +46,3 @@ sync_wrapper = { workspace = true, features = ["futures"] }
camino-tempfile.workspace = true
test-context.workspace = true
rand.workspace = true
tokio = { workspace = true, features = ["test-util"] }

View File

@@ -9,3 +9,5 @@ serde.workspace = true
serde_with.workspace = true
const_format.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -9,3 +9,5 @@ license.workspace = true
anyhow.workspace = true
serde.workspace = true
serde_json.workspace = true
workspace_hack.workspace = true

View File

@@ -14,3 +14,5 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
workspace_hack.workspace = true

View File

@@ -14,6 +14,7 @@ testing = ["fail/failpoints"]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
bytes.workspace = true
@@ -25,6 +26,7 @@ hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true
leaky-bucket.workspace = true
nix.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
@@ -37,7 +39,7 @@ thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = ["serde"] }
toml_edit.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -52,6 +54,7 @@ walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true
metrics.workspace = true
workspace_hack.workspace = true
const_format.workspace = true
@@ -68,7 +71,6 @@ criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
serde_assert.workspace = true
tokio = { workspace = true, features = ["test-util"] }
[[bench]]
name = "benchmarks"

View File

@@ -1,280 +0,0 @@
//! This module implements the Generic Cell Rate Algorithm for a simplified
//! version of the Leaky Bucket rate limiting system.
//!
//! # Leaky Bucket
//!
//! If the bucket is full, no new requests are allowed and are throttled/errored.
//! If the bucket is partially full/empty, new requests are added to the bucket in
//! terms of "tokens".
//!
//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
//!
//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
//!
//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
//!
//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
//!
//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
//!
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
use std::{sync::Mutex, time::Duration};
use tokio::{sync::Notify, time::Instant};
pub struct LeakyBucketConfig {
/// This is the "time cost" of a single request unit.
/// Should loosely represent how long it takes to handle a request unit in active resource time.
/// Loosely speaking this is the inverse of the steady-rate requests-per-second
pub cost: Duration,
/// total size of the bucket
pub bucket_width: Duration,
}
impl LeakyBucketConfig {
pub fn new(rps: f64, bucket_size: f64) -> Self {
let cost = Duration::from_secs_f64(rps.recip());
let bucket_width = cost.mul_f64(bucket_size);
Self { cost, bucket_width }
}
}
pub struct LeakyBucketState {
/// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
///
/// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
/// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
/// Draining the bucket will happen naturally as `now` moves forward.
///
/// Let `n` be some "time cost" for the request,
/// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
/// If now is within the `bucket window + n`, we are within time budget.
/// If now is before the `bucket window + n`, we have run out of budget.
///
/// This is inspired by the generic cell rate algorithm (GCRA) and works
/// exactly the same as a leaky-bucket.
pub empty_at: Instant,
}
impl LeakyBucketState {
pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
LeakyBucketState {
empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
}
}
pub fn bucket_is_empty(&self, now: Instant) -> bool {
// if self.end is after now, the bucket is not empty
self.empty_at <= now
}
/// Immediately adds tokens to the bucket, if there is space.
///
/// In a scenario where you are waiting for available rate,
/// rather than just erroring immediately, `started` corresponds to when this waiting started.
///
/// `n` is the number of tokens that will be filled in the bucket.
///
/// # Errors
///
/// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
/// there will be space again.
pub fn add_tokens(
&mut self,
config: &LeakyBucketConfig,
started: Instant,
n: f64,
) -> Result<(), Instant> {
let now = Instant::now();
// invariant: started <= now
debug_assert!(started <= now);
// If the bucket was empty when we started our search,
// we should update the `empty_at` value accordingly.
// this prevents us from having negative tokens in the bucket.
let mut empty_at = self.empty_at;
if empty_at < started {
empty_at = started;
}
let n = config.cost.mul_f64(n);
let new_empty_at = empty_at + n;
let allow_at = new_empty_at.checked_sub(config.bucket_width);
// empty_at
// allow_at | new_empty_at
// / | /
// -------o-[---------o-|--]---------
// now1 ^ now2 ^
//
// at now1, the bucket would be completely filled if we add n tokens.
// at now2, the bucket would be partially filled if we add n tokens.
match allow_at {
Some(allow_at) if now < allow_at => Err(allow_at),
_ => {
self.empty_at = new_empty_at;
Ok(())
}
}
}
}
pub struct RateLimiter {
pub config: LeakyBucketConfig,
pub state: Mutex<LeakyBucketState>,
/// a queue to provide this fair ordering.
pub queue: Notify,
}
struct Requeue<'a>(&'a Notify);
impl Drop for Requeue<'_> {
fn drop(&mut self) {
self.0.notify_one();
}
}
impl RateLimiter {
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
RateLimiter {
state: Mutex::new(LeakyBucketState::with_initial_tokens(
&config,
initial_tokens,
)),
config,
queue: {
let queue = Notify::new();
queue.notify_one();
queue
},
}
}
pub fn steady_rps(&self) -> f64 {
self.config.cost.as_secs_f64().recip()
}
/// returns true if we did throttle
pub async fn acquire(&self, count: usize) -> bool {
let mut throttled = false;
let start = tokio::time::Instant::now();
// wait until we are the first in the queue
let mut notified = std::pin::pin!(self.queue.notified());
if !notified.as_mut().enable() {
throttled = true;
notified.await;
}
// notify the next waiter in the queue when we are done.
let _guard = Requeue(&self.queue);
loop {
let res = self
.state
.lock()
.unwrap()
.add_tokens(&self.config, start, count as f64);
match res {
Ok(()) => return throttled,
Err(ready_at) => {
throttled = true;
tokio::time::sleep_until(ready_at).await;
}
}
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use tokio::time::Instant;
use super::{LeakyBucketConfig, LeakyBucketState};
#[tokio::test(start_paused = true)]
async fn check() {
let config = LeakyBucketConfig {
// average 100rps
cost: Duration::from_millis(10),
// burst up to 100 requests
bucket_width: Duration::from_millis(1000),
};
let mut state = LeakyBucketState {
empty_at: Instant::now(),
};
// supports burst
{
// should work for 100 requests this instant
for _ in 0..100 {
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
// doesn't overfill
{
// after 1s we should have an empty bucket again.
tokio::time::advance(Duration::from_secs(1)).await;
assert!(state.bucket_is_empty(Instant::now()));
// after 1s more, we should not over count the tokens and allow more than 200 requests.
tokio::time::advance(Duration::from_secs(1)).await;
for _ in 0..100 {
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
// supports sustained rate over a long period
{
tokio::time::advance(Duration::from_secs(1)).await;
// should sustain 100rps
for _ in 0..2000 {
tokio::time::advance(Duration::from_millis(10)).await;
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
}
}
// supports requesting more tokens than can be stored in the bucket
// we just wait a little bit longer upfront.
{
// start the bucket completely empty
tokio::time::advance(Duration::from_secs(5)).await;
assert!(state.bucket_is_empty(Instant::now()));
// requesting 200 tokens of space should take 200*cost = 2s
// but we already have 1s available, so we wait 1s from start.
let start = Instant::now();
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_secs(1));
tokio::time::advance(Duration::from_millis(500)).await;
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(500));
tokio::time::advance(Duration::from_millis(500)).await;
state.add_tokens(&config, start, 200.0).unwrap();
// bucket should be completely full now
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
}
}
}

View File

@@ -71,7 +71,6 @@ pub mod postgres_client;
pub mod tracing_span_assert;
pub mod leaky_bucket;
pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.

View File

@@ -5,15 +5,6 @@ use std::time::{Duration, Instant};
pub struct RateLimit {
last: Option<Instant>,
interval: Duration,
dropped: u64,
}
pub struct RateLimitStats(u64);
impl std::fmt::Display for RateLimitStats {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{} dropped calls", self.0)
}
}
impl RateLimit {
@@ -21,27 +12,20 @@ impl RateLimit {
Self {
last: None,
interval,
dropped: 0,
}
}
/// Call `f` if the rate limit allows.
/// Don't call it otherwise.
pub fn call<F: FnOnce()>(&mut self, f: F) {
self.call2(|_| f())
}
pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
let now = Instant::now();
match self.last {
Some(last) if now - last <= self.interval => {
// ratelimit
self.dropped += 1;
}
_ => {
self.last = Some(now);
f(RateLimitStats(self.dropped));
self.dropped = 0;
f();
}
}
}

View File

@@ -9,6 +9,8 @@ anyhow.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
workspace_hack.workspace = true
[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true

View File

@@ -95,7 +95,6 @@ fn main() -> anyhow::Result<()> {
.allowlist_var("ERROR")
.allowlist_var("FATAL")
.allowlist_var("PANIC")
.allowlist_var("PG_VERSION_NUM")
.allowlist_var("WPEVENT")
.allowlist_var("WL_LATCH_SET")
.allowlist_var("WL_SOCKET_READABLE")

View File

@@ -282,11 +282,7 @@ mod tests {
use std::cell::UnsafeCell;
use utils::id::TenantTimelineId;
use crate::{
api_bindings::Level,
bindings::{NeonWALReadResult, PG_VERSION_NUM},
walproposer::Wrapper,
};
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
use super::ApiImpl;
@@ -493,79 +489,41 @@ mod tests {
let (sender, receiver) = sync_channel(1);
// Messages definitions are at walproposer.h
// xxx: it would be better to extract them from safekeeper crate and
// use serialization/deserialization here.
let greeting_tag = (b'g' as u64).to_ne_bytes();
let proto_version = 2_u32.to_ne_bytes();
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
let proposer_id = [0; 16];
let system_id = 0_u64.to_ne_bytes();
let tenant_id = ttid.tenant_id.as_arr();
let timeline_id = ttid.timeline_id.as_arr();
let pg_tli = 1_u32.to_ne_bytes();
let wal_seg_size = 16777216_u32.to_ne_bytes();
let proposer_greeting = [
greeting_tag.as_slice(),
proto_version.as_slice(),
pg_version.as_slice(),
proposer_id.as_slice(),
system_id.as_slice(),
tenant_id.as_slice(),
timeline_id.as_slice(),
pg_tli.as_slice(),
wal_seg_size.as_slice(),
]
.concat();
let voting_tag = (b'v' as u64).to_ne_bytes();
let vote_request_term = 3_u64.to_ne_bytes();
let proposer_id = [0; 16];
let vote_request = [
voting_tag.as_slice(),
vote_request_term.as_slice(),
proposer_id.as_slice(),
]
.concat();
let acceptor_greeting_term = 2_u64.to_ne_bytes();
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
let acceptor_greeting = [
greeting_tag.as_slice(),
acceptor_greeting_term.as_slice(),
acceptor_greeting_node_id.as_slice(),
]
.concat();
let vote_response_term = 3_u64.to_ne_bytes();
let vote_given = 1_u64.to_ne_bytes();
let flush_lsn = 0x539_u64.to_ne_bytes();
let truncate_lsn = 0x539_u64.to_ne_bytes();
let th_len = 1_u32.to_ne_bytes();
let th_term = 2_u64.to_ne_bytes();
let th_lsn = 0x539_u64.to_ne_bytes();
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
let vote_response = [
voting_tag.as_slice(),
vote_response_term.as_slice(),
vote_given.as_slice(),
flush_lsn.as_slice(),
truncate_lsn.as_slice(),
th_len.as_slice(),
th_term.as_slice(),
th_lsn.as_slice(),
timeline_start_lsn.as_slice(),
]
.concat();
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
wait_events: Cell::new(WaitEventsData {
sk: std::ptr::null_mut(),
event_mask: 0,
}),
expected_messages: vec![proposer_greeting, vote_request],
expected_messages: vec![
// TODO: When updating Postgres versions, this test will cause
// problems. Postgres version in message needs updating.
//
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
],
// VoteRequest(VoteRequest { term: 3 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
],
],
expected_ptr: AtomicUsize::new(0),
safekeeper_replies: vec![acceptor_greeting, vote_response],
safekeeper_replies: vec![
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
],
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
],
],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),

View File

@@ -16,7 +16,6 @@ arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
bit_field.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
@@ -37,6 +36,7 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
leaky-bucket.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
@@ -52,7 +52,6 @@ rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
scopeguard.workspace = true
send-future.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true

View File

@@ -4,13 +4,12 @@ use bytes::Bytes;
use camino::Utf8PathBuf;
use criterion::{criterion_group, criterion_main, Criterion};
use pageserver::{
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
l0_flush::{L0FlushConfig, L0FlushGlobalState},
page_cache,
repository::Value,
task_mgr::TaskKind,
tenant::storage_layer::inmemory_layer::SerializedBatch,
tenant::storage_layer::InMemoryLayer,
virtual_file,
};
@@ -68,16 +67,12 @@ async fn ingest(
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
let data_ser_size = data.serialized_size().unwrap() as usize;
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
let ctx = RequestContext::new(
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
pageserver::context::DownloadBehavior::Download,
);
const BATCH_SIZE: usize = 16;
let mut batch = Vec::new();
for i in 0..put_count {
lsn += put_size as u64;
@@ -100,17 +95,7 @@ async fn ingest(
}
}
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
if batch.len() >= BATCH_SIZE {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
}
}
if !batch.is_empty() {
let this_batch = std::mem::take(&mut batch);
let serialized = SerializedBatch::from_values(this_batch).unwrap();
layer.put_batch(serialized, &ctx).await?;
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
}
layer.freeze(lsn + 1).await;
@@ -164,11 +149,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let conf: &'static PageServerConf = Box::leak(Box::new(
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
));
virtual_file::init(
16384,
virtual_file::io_engine_for_bench(),
DEFAULT_IO_BUFFER_ALIGNMENT,
);
virtual_file::init(16384, virtual_file::io_engine_for_bench());
page_cache::init(conf.page_cache_size);
{

View File

@@ -7,6 +7,7 @@ license.workspace = true
[dependencies]
pageserver_api.workspace = true
thiserror.workspace = true
async-trait.workspace = true
reqwest = { workspace = true, features = [ "stream" ] }
utils.workspace = true
serde.workspace = true

View File

@@ -506,16 +506,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// Configs io buffer alignment at runtime.
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
self.request(Method::PUT, uri, align)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
self.get(uri)

View File

@@ -4,7 +4,6 @@
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -145,11 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let mut total_delta_layers = 0usize;

View File

@@ -3,7 +3,6 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Subcommand;
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
@@ -60,7 +59,7 @@ pub(crate) enum LayerCmd {
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path, ctx).await?;
let file_id = page_cache::next_file_id();
@@ -90,7 +89,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
println!("key:{} value_len:{}", k, value.len());
assert!(k.is_i128_representable(), "invalid key: ");
}
// TODO(chi): special handling for last key?
Ok(())
@@ -191,11 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

View File

@@ -20,7 +20,6 @@ use clap::{Parser, Subcommand};
use index_part::IndexPartCmd;
use layers::LayerCmd;
use pageserver::{
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
context::{DownloadBehavior, RequestContext},
page_cache,
task_mgr::TaskKind,
@@ -206,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(
10,
virtual_file::api::IoEngineKind::StdFs,
DEFAULT_IO_BUFFER_ALIGNMENT,
);
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await

View File

@@ -58,11 +58,6 @@ pub(crate) struct Args {
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
#[clap(long)]
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
#[clap(long)]
set_io_alignment: Option<usize>,
targets: Option<Vec<TenantTimelineId>>,
}
@@ -129,10 +124,6 @@ async fn main_impl(
mgmt_api_client.put_io_engine(engine_str).await?;
}
if let Some(align) = args.set_io_alignment {
mgmt_api_client.put_io_alignment(align).await?;
}
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,

View File

@@ -1,39 +0,0 @@
//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
}
};
pub(crate) trait U64IsUsize {
fn into_usize(self) -> usize;
}
impl U64IsUsize for u64 {
#[inline(always)]
fn into_usize(self) -> usize {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
self as usize
}
}
pub(crate) trait UsizeIsU64 {
fn into_u64(self) -> u64;
}
impl UsizeIsU64 for usize {
#[inline(always)]
fn into_u64(self) -> u64 {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
self as u64
}
}
pub const fn u64_to_usize(x: u64) -> usize {
#[allow(clippy::let_unit_value)]
let _ = _ASSERT_U64_EQ_USIZE;
x as usize
}

View File

@@ -125,69 +125,18 @@ fn main() -> anyhow::Result<()> {
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
// The tenants directory contains all the pageserver local disk state.
// Create if not exists and make sure all the contents are durable before proceeding.
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
let tenants_path = conf.tenants_path();
{
let open = || {
nix::dir::Dir::open(
tenants_path.as_std_path(),
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
nix::sys::stat::Mode::empty(),
)
};
let dirfd = match open() {
Ok(dirfd) => dirfd,
Err(e) => match e {
nix::errno::Errno::ENOENT => {
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
format!("Failed to create tenants root dir at '{tenants_path}'")
})?;
open().context("open tenants dir after creating it")?
}
e => anyhow::bail!(e),
},
};
let started = Instant::now();
// Linux guarantees durability for syncfs.
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
#[cfg(target_os = "linux")]
{
use std::os::fd::AsRawFd;
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
}
#[cfg(target_os = "macos")]
{
// macOS is not a production platform for Neon, don't even bother.
drop(dirfd);
}
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
compile_error!("Unsupported OS");
}
let elapsed = started.elapsed();
info!(
elapsed_ms = elapsed.as_millis(),
"made tenant directory contents durable"
);
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path())
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
}
// Initialize up failpoints support
let scenario = failpoint_support::init();
// Basic initialization of things that don't change after startup
virtual_file::init(
conf.max_file_descriptors,
conf.virtual_file_io_engine,
conf.io_buffer_alignment,
);
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
page_cache::init(conf.page_cache_size);
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

View File

@@ -31,7 +31,6 @@ use utils::{
use crate::l0_flush::L0FlushConfig;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -96,8 +95,6 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
///
/// Default built-in configuration file.
///
@@ -292,8 +289,6 @@ pub struct PageServerConf {
/// Direct IO settings
pub virtual_file_direct_io: virtual_file::DirectIoMode,
pub io_buffer_alignment: usize,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -398,8 +393,6 @@ struct PageServerConfigBuilder {
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
io_buffer_alignment: BuilderValue<usize>,
}
impl PageServerConfigBuilder {
@@ -488,7 +481,6 @@ impl PageServerConfigBuilder {
l0_flush: Set(L0FlushConfig::default()),
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
}
}
}
@@ -668,10 +660,6 @@ impl PageServerConfigBuilder {
self.virtual_file_direct_io = BuilderValue::Set(value);
}
pub fn io_buffer_alignment(&mut self, value: usize) {
self.io_buffer_alignment = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
@@ -728,7 +716,6 @@ impl PageServerConfigBuilder {
l0_flush,
compact_level0_phase1_value_access,
virtual_file_direct_io,
io_buffer_alignment,
}
CUSTOM LOGIC
{
@@ -998,9 +985,6 @@ impl PageServerConf {
"virtual_file_direct_io" => {
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
}
"io_buffer_alignment" => {
builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
}
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -1021,15 +1005,6 @@ impl PageServerConf {
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
.map_err(|msg| anyhow::anyhow!("{msg}"))
.with_context(|| {
format!(
"effective checkpoint distance is unsupported: {}",
conf.default_tenant_conf.checkpoint_distance
)
})?;
Ok(conf)
}
@@ -1093,7 +1068,6 @@ impl PageServerConf {
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
}
}
}
@@ -1334,7 +1308,6 @@ background_task_maximum_delay = '334 s'
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1408,7 +1381,6 @@ background_task_maximum_delay = '334 s'
l0_flush: L0FlushConfig::default(),
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -141,18 +141,12 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
m.other
);
let az_id = m
.other
.get("availability_zone_id")
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
Some(NodeRegisterRequest {
node_id: conf.id,
listen_pg_addr: m.postgres_host,
listen_pg_port: m.postgres_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
availability_zone_id: az_id,
})
}
Err(e) => {

View File

@@ -318,27 +318,6 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
}
}
impl From<crate::tenant::TimelineArchivalError> for ApiError {
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
use crate::tenant::TimelineArchivalError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
e @ HasArchivedParent(_) => {
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
}
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
format!(
"Cannot archive timeline which has non-archived child timelines: {children:?}"
)
.into_boxed_str(),
),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
use crate::tenant::mgr::DeleteTimelineError::*;
@@ -426,8 +405,6 @@ async fn build_timeline_info_common(
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
// Report is_archived = false if the timeline is still loading
let is_archived = timeline.is_archived().unwrap_or(false);
let remote_consistent_lsn_projected = timeline
.get_remote_consistent_lsn_projected()
.unwrap_or(Lsn(0));
@@ -468,7 +445,6 @@ async fn build_timeline_info_common(
pg_version: timeline.pg_version,
state,
is_archived,
walreceiver_status,
@@ -710,7 +686,9 @@ async fn timeline_archival_config_handler(
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await?;
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
@@ -874,10 +852,7 @@ async fn get_timestamp_of_lsn_handler(
match result {
Some(time) => {
let time = format_rfc3339(
postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
)
.to_string();
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time)
}
None => Err(ApiError::NotFound(
@@ -1731,12 +1706,13 @@ async fn timeline_compact_handler(
flags |= CompactFlags::ForceImageLayerCreation;
}
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
if !cfg!(feature = "testing") {
return Err(ApiError::InternalServerError(anyhow!(
"enhanced_gc_bottom_most_compaction is only available in testing mode"
)));
}
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
}
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
flags |= CompactFlags::DryRun;
}
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -2354,20 +2330,6 @@ async fn put_io_engine_handler(
json_response(StatusCode::OK, ())
}
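(Aside, not part of the diff: the get_timestamp_of_lsn_handler hunk earlier in this file converts a Postgres timestamp before formatting it as RFC 3339. A minimal standalone sketch of that conversion, assuming only that Postgres timestamps count microseconds since 2000-01-01 UTC; this is not postgres_ffi's actual implementation and it ignores negative or overflowing values.)

use std::time::{Duration, SystemTime, UNIX_EPOCH};

// Seconds between the Unix epoch (1970-01-01) and the Postgres epoch (2000-01-01).
const PG_EPOCH_OFFSET_SECS: u64 = 946_684_800;

fn pg_timestamp_to_system_time(pg_micros: u64) -> SystemTime {
    UNIX_EPOCH + Duration::from_secs(PG_EPOCH_OFFSET_SECS) + Duration::from_micros(pg_micros)
}

fn main() {
    // 0 microseconds after the Postgres epoch is exactly 2000-01-01T00:00:00Z.
    let t = pg_timestamp_to_system_time(0);
    assert_eq!(t.duration_since(UNIX_EPOCH).unwrap().as_secs(), PG_EPOCH_OFFSET_SECS);
}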
async fn put_io_alignment_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let align: usize = json_request(&mut r).await?;
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
ApiError::PreconditionFailed(
format!("Requested io alignment ({align}) is not a power of two").into(),
)
})?;
json_response(StatusCode::OK, ())
}
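(Aside, not part of the diff: the removed put_io_alignment_handler rejects any alignment that is not a power of two. A minimal sketch of that validation rule with the storage of the value stubbed out; the real setter lives in crate::virtual_file.)

// Mirrors the handler's map_err(|align| ...) shape: the rejected value is the error.
fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
    if align.is_power_of_two() {
        // a real implementation would store the alignment here
        Ok(())
    } else {
        Err(align)
    }
}

fn main() {
    assert!(set_io_buffer_alignment(512).is_ok());
    assert!(set_io_buffer_alignment(0).is_err());
    assert!(set_io_buffer_alignment(3).is_err());
}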
/// Polled by control plane.
///
/// See [`crate::utilization`].
@@ -2980,7 +2942,7 @@ pub fn make_router(
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_compact_handler),
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
@@ -3055,9 +3017,6 @@ pub fn make_router(
|r| api_handler(r, timeline_collect_keyspace),
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.put("/v1/io_alignment", |r| {
api_handler(r, put_io_alignment_handler)
})
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|r| api_handler(r, force_aux_policy_switch_handler),

View File

@@ -16,7 +16,6 @@ pub mod l0_flush;
use futures::{stream::FuturesUnordered, StreamExt};
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
mod assert_u64_eq_usize;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
@@ -89,8 +88,6 @@ pub async fn shutdown_pageserver(
) {
use std::time::Duration;
let started_at = std::time::Instant::now();
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
@@ -244,10 +241,7 @@ pub async fn shutdown_pageserver(
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!(
elapsed_ms = started_at.elapsed().as_millis(),
"Shut down successfully completed"
);
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -1552,6 +1552,7 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
PageStreamV2,
PageStream,
Basebackup,
Fullbackup,
LeaseLsn,
@@ -1802,14 +1803,6 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_utilization_score",
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
)
.expect("failed to define a metric")
});
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_secondary_heatmap_total_size",

View File

@@ -557,7 +557,7 @@ impl PageServerHandler {
pgb: &mut PostgresBackend<IO>,
tenant_id: TenantId,
timeline_id: TimelineId,
_protocol_version: PagestreamProtocolVersion,
protocol_version: PagestreamProtocolVersion,
ctx: RequestContext,
) -> Result<(), QueryError>
where
@@ -601,7 +601,8 @@ impl PageServerHandler {
fail::fail_point!("ps::handle-pagerequest-message");
// parse request
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
// invoke handler function
let (handler_result, span) = match neon_fe_msg {
@@ -753,21 +754,16 @@ impl PageServerHandler {
}
if request_lsn < **latest_gc_cutoff_lsn {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.leases.contains_key(&request_lsn) {
// The requested LSN is below gc cutoff and is not guarded by a lease.
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
} else {
PageStreamError::BadRequest(format!(
// Check explicitly for INVALID just to get a less scary error message if the
// request is obviously bogus
return Err(if request_lsn == Lsn::INVALID {
PageStreamError::BadRequest("invalid LSN(0) in request".into())
} else {
PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
).into())
});
}
});
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
@@ -794,8 +790,6 @@ impl PageServerHandler {
}
}
/// Handles the lsn lease request.
/// If a lease cannot be obtained, the client will receive NULL.
#[instrument(skip_all, fields(shard_id, %lsn))]
async fn handle_make_lsn_lease<IO>(
&mut self,
@@ -818,25 +812,19 @@ impl PageServerHandler {
.await?;
set_tracing_field_shard_id(&timeline);
let lease = timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
.inspect_err(|e| {
warn!("{e}");
})
.ok();
let valid_until_str = lease.map(|l| {
l.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.expect("valid_until is earlier than UNIX_EPOCH")
.as_millis()
.to_string()
});
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
let valid_until = lease
.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
.map_err(|e| QueryError::Other(e.into()))?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
b"valid_until",
)]))?
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
.write_message_noflush(&BeMessage::DataRow(&[Some(
&valid_until.as_millis().to_be_bytes(),
)]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
Ok(())
}
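(Aside, not part of the diff: both sides of the hunk above serialize the lease expiry as milliseconds since UNIX_EPOCH, one as text and one as big-endian bytes. A minimal sketch of the inverse conversion a client could apply; no claim is made about the real compute client's parsing code.)

use std::time::{Duration, SystemTime, UNIX_EPOCH};

fn lease_valid_until_from_millis(millis: u64) -> SystemTime {
    UNIX_EPOCH + Duration::from_millis(millis)
}

fn main() {
    let now_millis = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis() as u64;
    // Round-tripping through milliseconds can only lose sub-millisecond precision.
    assert!(lease_valid_until_from_millis(now_millis) <= SystemTime::now());
}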
@@ -1287,6 +1275,35 @@ where
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
if params.len() != 2 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for pagestream command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
let timeline_id = TimelineId::from_str(params[1])
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
tracing::Span::current()
.record("tenant_id", field::display(tenant_id))
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStream)
.inc();
self.handle_pagerequests(
pgb,
tenant_id,
timeline_id,
PagestreamProtocolVersion::V1,
ctx,
)
.await?;
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
if params.len() < 2 {
return Err(QueryError::Other(anyhow::anyhow!(

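(Aside, not part of the diff: the branch above accepts the legacy two-parameter pagestream command and routes it to the V1 protocol, next to the existing V2 variant counted as ComputeCommandKind::PageStreamV2. A minimal sketch of the query string a client would send to hit this branch; the tenant and timeline IDs below are made up and the libpq plumbing is omitted.)

fn pagestream_query(tenant_id: &str, timeline_id: &str) -> String {
    format!("pagestream {tenant_id} {timeline_id}")
}

fn main() {
    let q = pagestream_query(
        "00000000000000000000000000000001",
        "00000000000000000000000000000002",
    );
    // Exactly the shape the handler expects: the command word plus two params.
    assert_eq!(q.split_whitespace().count(), 3);
}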
View File

@@ -12,14 +12,15 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{bail, ensure, Context};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
use pageserver_api::key::{
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
@@ -36,6 +37,7 @@ use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -172,7 +174,6 @@ impl Timeline {
pending_deletions: Vec::new(),
pending_nblocks: 0,
pending_directory_entries: Vec::new(),
pending_bytes: 0,
lsn,
}
}
@@ -726,17 +727,7 @@ impl Timeline {
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) => {
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
self.list_aux_files_v1(lsn, ctx).await
}
None => {
let res = self.list_aux_files_v1(lsn, ctx).await?;
if !res.is_empty() {
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
}
Ok(res)
}
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
@@ -1031,33 +1022,21 @@ pub struct DatadirModification<'a> {
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_nblocks: i64,
/// For special "directory" keys that store key-value maps, track the size of the map
/// if it was updated in this modification.
pending_directory_entries: Vec<(DirectoryKind, usize)>,
/// An **approximation** of how large our EphemeralFile write will be when committed.
pending_bytes: usize,
}
impl<'a> DatadirModification<'a> {
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}
pub(crate) fn approx_pending_bytes(&self) -> usize {
self.pending_bytes
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -1597,7 +1576,6 @@ impl<'a> DatadirModification<'a> {
if aux_files_key_v1.is_empty() {
None
} else {
warn!("this timeline is using deprecated aux file policy V1");
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
Some(AuxFilePolicy::V1)
}
@@ -1791,30 +1769,21 @@ impl<'a> DatadirModification<'a> {
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
if !key.is_valid_key_on_write_path() {
bail!(
"the request contains data not supported by pageserver at TimelineWriter::put: {}", key
);
}
let mut write_batch = Vec::new();
for (lsn, value_ser_size, value) in values {
for (lsn, value) in values {
if key.is_rel_block_key() || key.is_slru_block_key() {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates.entry(key).or_default().push((
lsn,
value_ser_size,
value,
));
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
}
}
writer.put_batch(write_batch, ctx).await?;
}
self.pending_updates = retained_pending_updates;
self.pending_bytes = 0;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1840,23 +1809,17 @@ impl<'a> DatadirModification<'a> {
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
// Ordering: the items in this batch do not need to be in any global order, but values for
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
// this to do efficient updates to its index.
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
.pending_updates
.drain()
.flat_map(|(key, values)| {
values.into_iter().map(move |(lsn, val_ser_size, value)| {
if !key.is_valid_key_on_write_path() {
bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
}
Ok((key.to_compact(), lsn, val_ser_size, value))
})
})
.collect::<anyhow::Result<Vec<_>>>()?;
// The put_batch call below expects the inputs to be sorted by Lsn,
// so we do that first.
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
self.pending_updates
.drain()
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
VecMapOrdering::GreaterOrEqual,
);
writer.put_batch(batch, ctx).await?;
writer.put_batch(lsn_ordered_batch, ctx).await?;
}
if !self.pending_deletions.is_empty() {
@@ -1881,8 +1844,6 @@ impl<'a> DatadirModification<'a> {
writer.update_directory_entries_count(kind, count as u64);
}
self.pending_bytes = 0;
Ok(())
}
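(Aside, not part of the diff: the MAX_PENDING_BYTES / approx_pending_bytes hunk earlier in this file bounds how much serialized payload a DatadirModification may accumulate before its caller commits it. A self-contained sketch of that caller-side rule; the struct below is a stand-in, not the real DatadirModification.)

struct PendingBatch {
    pending_bytes: usize,
}

impl PendingBatch {
    // Mirrors DatadirModification::MAX_PENDING_BYTES above.
    const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;

    fn put(&mut self, serialized_len: usize) {
        self.pending_bytes += serialized_len;
    }

    // The ingest loop would commit (and reset the counter) once this returns true.
    fn should_commit(&self) -> bool {
        self.pending_bytes >= Self::MAX_PENDING_BYTES
    }
}

fn main() {
    let mut batch = PendingBatch { pending_bytes: 0 };
    batch.put(6 * 1024 * 1024);
    assert!(!batch.should_commit());
    batch.put(3 * 1024 * 1024);
    assert!(batch.should_commit());
}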
@@ -1899,7 +1860,7 @@ impl<'a> DatadirModification<'a> {
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, _, value)) = values.last() {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
@@ -1927,17 +1888,13 @@ impl<'a> DatadirModification<'a> {
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
*last_value_ser_size = val.serialized_size().unwrap() as usize;
*last_value = val;
return;
}
}
let val_serialized_size = val.serialized_size().unwrap() as usize;
self.pending_bytes += val_serialized_size;
values.push((self.lsn, val_serialized_size, val));
values.push((self.lsn, val));
}
fn delete(&mut self, key_range: Range<Key>) {
@@ -2067,7 +2024,7 @@ mod tests {
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();

View File

@@ -146,12 +146,6 @@ impl FromStr for TokioRuntimeMode {
}
}
static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
// the default 2 MiB is insufficient, especially in debug mode
.unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
});
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
let thread_name = "pageserver-tokio";
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
@@ -170,7 +164,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
tokio::runtime::Builder::new_current_thread()
.thread_name(thread_name)
.enable_all()
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect("failed to create one single runtime")
}
@@ -180,7 +173,6 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
.thread_name(thread_name)
.enable_all()
.worker_threads(num_workers.get())
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect("failed to create one multi-threaded runtime")
}
@@ -207,7 +199,6 @@ macro_rules! pageserver_runtime {
.thread_name($name)
.worker_threads(TOKIO_WORKER_THREADS.get())
.enable_all()
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
.build()
.expect(std::concat!("Failed to create runtime ", $name))
});
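(Aside, not part of the diff: the removed TOKIO_THREAD_STACK_SIZE override reads an env var and falls back to 4 MiB because the default 2 MiB stack is too small in debug builds. A minimal sketch of that lookup; the real code goes through the pageserver's env/parse helpers rather than parsing by hand.)

use std::num::NonZeroUsize;

fn tokio_thread_stack_size() -> NonZeroUsize {
    std::env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
        .ok()
        .and_then(|s| s.parse::<usize>().ok())
        .and_then(NonZeroUsize::new)
        // the default 2 MiB is insufficient, especially in debug mode
        .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
}

fn main() {
    println!("tokio thread stack size: {}", tokio_thread_stack_size());
}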

View File

@@ -501,42 +501,6 @@ impl Debug for DeleteTimelineError {
}
}
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
#[error("NotFound")]
NotFound,
#[error("Timeout")]
Timeout,
#[error("ancestor is archived: {}", .0)]
HasArchivedParent(TimelineId),
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),
#[error("Timeline archival is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::Timeout => write!(f, "Timeout"),
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
Self::HasUnarchivedChildren(c) => {
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
}
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
@@ -881,12 +845,6 @@ impl Tenant {
});
};
// TODO: should also be rejecting tenant conf changes that violate this check.
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
let mut init_order = init_order;
// take the completion because initial tenant loading will complete when all of
// these tasks complete.
@@ -1368,59 +1326,24 @@ impl Tenant {
&self,
timeline_id: TimelineId,
state: TimelineArchivalState,
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
let timeline = {
let timelines = self.timelines.lock().unwrap();
let Some(timeline) = timelines.get(&timeline_id) else {
return Err(TimelineArchivalError::NotFound);
};
if state == TimelineArchivalState::Unarchived {
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
if ancestor_timeline.is_archived() == Some(true) {
return Err(TimelineArchivalError::HasArchivedParent(
ancestor_timeline.timeline_id,
));
}
}
}
// Ensure that there are no non-archived child timelines
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();
if !children.is_empty() && state == TimelineArchivalState::Archived {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Arc::clone(timeline)
};
) -> anyhow::Result<()> {
let timeline = self
.get_timeline(timeline_id, false)
.context("Cannot apply timeline archival config to inexistent timeline")?;
let upload_needed = timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(state)?;
if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
return Err(TimelineArchivalError::Timeout);
bail!("reached timeout for upload queue flush");
};
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
v?;
}
Ok(())
}
@@ -3818,21 +3741,13 @@ impl Tenant {
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let mut wanted = 0;
let timelines = self.timelines.lock().unwrap();
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
for timeline in timelines.values() {
wanted += timeline.metrics.visible_physical_size_gauge.get();
}
wanted
}
}
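(Aside, not part of the diff: the local_storage_wanted hunk above switches between two heuristics over the timelines' visible physical sizes, summing them versus taking the maximum; the max variant encodes the observation that typically only one large branch is active at a time. A minimal side-by-side sketch of the two.)

fn wanted_sum(visible_sizes: &[u64]) -> u64 {
    visible_sizes.iter().sum()
}

fn wanted_max(visible_sizes: &[u64]) -> u64 {
    visible_sizes.iter().copied().max().unwrap_or(0)
}

fn main() {
    let sizes = [100, 2_000, 30];
    assert_eq!(wanted_sum(&sizes), 2_130);
    assert_eq!(wanted_max(&sizes), 2_000);
}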
@@ -6017,10 +5932,10 @@ mod tests {
.await
.unwrap();
// the default aux file policy to switch is v2 if not set by the admins
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
harness.tenant_conf.switch_aux_file_policy,
AuxFilePolicy::default_tenant_config()
AuxFilePolicy::V1
);
let (tenant, ctx) = harness.load().await;
@@ -6064,8 +5979,8 @@ mod tests {
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
Some(AuxFilePolicy::V1),
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
);
// we can read everything from the storage
@@ -6087,8 +6002,8 @@ mod tests {
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"keep v2 storage format when new files are written"
Some(AuxFilePolicy::V1),
"keep v1 storage format when new files are written"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
@@ -6104,7 +6019,7 @@ mod tests {
// child copies the last flag even if that is not on remote storage yet
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
@@ -7090,14 +7005,18 @@ mod tests {
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..Key::NON_L0_MAX,
key_range: {
let mut key = Key::MAX;
key.field6 -= 1;
Key::MIN..key
},
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
// The delta layer that is cut in the middle
PersistentLayerKey {
key_range: Key::MIN..Key::NON_L0_MAX,
lsn_range: Lsn(0x30)..Lsn(0x48),
key_range: get_key(3)..get_key(4),
lsn_range: Lsn(0x30)..Lsn(0x41),
is_delta: true
},
// The delta3 layer that should not be picked for the compaction
@@ -8077,214 +7996,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
{
let harness =
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
.await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(1),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
];
let delta2 = vec![
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(1),
Lsn(0x38),
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
// delta1 and delta 2 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![
(Lsn(0x10), tline.timeline_id),
(Lsn(0x20), tline.timeline_id),
],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_20 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_10 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x20), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_20[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x10), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_10[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;

View File

@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
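(Aside, not part of the diff: together with the write path in this file and EphemeralFile::write_blob later in the diff, these constants define the blob framing: a first byte below 0x80 is the whole length, otherwise the header is four big-endian bytes whose top nibble carries the marker/compression bits and whose low 28 bits carry the length. A hedged decoding sketch, not the crate's actual reader.)

const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

// Returns (header_len, blob_len, is_zstd).
fn decode_blob_header(buf: &[u8]) -> (usize, usize, bool) {
    let first = buf[0];
    if first < 0x80 {
        // short one-byte length header
        (1, first as usize, false)
    } else {
        let mut len_buf = [0u8; 4];
        len_buf.copy_from_slice(&buf[0..4]);
        let is_zstd = (first & LEN_COMPRESSION_BIT_MASK) == BYTE_ZSTD;
        len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; // keep only the low 28 bits of the length
        (4, u32::from_be_bytes(len_buf) as usize, is_zstd)
    }
}

fn main() {
    assert_eq!(decode_blob_header(&[0x03, b'f', b'o', b'o']), (1, 3, false));
    assert_eq!(decode_blob_header(&[0x80, 0x00, 0x01, 0x00]), (4, 256, false));
    assert_eq!(decode_blob_header(&[0x90, 0x00, 0x00, 0x2a]), (4, 42, true));
}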
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
} else {
// Write a 4-byte length header
if len > MAX_SUPPORTED_BLOB_LEN {
if len > MAX_SUPPORTED_LEN {
return (
(
io_buf.slice_len(),

View File

@@ -2,6 +2,7 @@
//! Low-level Block-oriented I/O functions
//!
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -80,7 +81,9 @@ impl<'a> Deref for BlockLease<'a> {
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReader(&'a FileBlockReader<'a>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
Slice(&'a [u8]),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
#[cfg(test)]
@@ -97,7 +100,9 @@ impl<'a> BlockReaderRef<'a> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
Slice(s) => Self::read_blk_slice(s, blknum),
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -106,6 +111,24 @@ impl<'a> BlockReaderRef<'a> {
}
}
impl<'a> BlockReaderRef<'a> {
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
let end = start.checked_add(PAGE_SZ).unwrap();
if end > slice.len() {
return Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
format!("slice too short, len={} end={}", slice.len(), end),
));
}
let slice = &slice[start..end];
let page_sized: &[u8; PAGE_SZ] = slice
.try_into()
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
Ok(BlockLease::Slice(page_sized))
}
}
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///

View File

@@ -1,21 +1,13 @@
//! Implementation of append-only file data structure
//! used to keep in-memory layers spilled on disk.
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
use bytes::BytesMut;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::{self, VirtualFile};
use camino::Utf8PathBuf;
use num_traits::Num;
use pageserver_api::shard::TenantShardId;
use tokio_epoll_uring::{BoundedBuf, Slice};
use tracing::error;
use std::io;
use std::sync::atomic::AtomicU64;
@@ -24,17 +16,12 @@ use utils::id::TimelineId;
pub struct EphemeralFile {
_tenant_shard_id: TenantShardId,
_timeline_id: TimelineId,
page_cache_file_id: page_cache::FileId,
bytes_written: u64,
buffered_writer: owned_buffers_io::write::BufferedWriter<
BytesMut,
size_tracking_writer::Writer<VirtualFile>,
>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
_gate_guard: utils::sync::gate::GateGuard,
rw: page_caching::RW,
}
const TAIL_SZ: usize = 64 * 1024;
mod page_caching;
mod zero_padded_read_write;
impl EphemeralFile {
pub async fn create(
@@ -64,178 +51,60 @@ impl EphemeralFile {
)
.await?;
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
Ok(EphemeralFile {
_tenant_shard_id: tenant_shard_id,
_timeline_id: timeline_id,
page_cache_file_id,
bytes_written: 0,
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
size_tracking_writer::Writer::new(file),
BytesMut::with_capacity(TAIL_SZ),
),
_gate_guard: gate_guard,
rw: page_caching::RW::new(file, gate_guard),
})
}
}
impl Drop for EphemeralFile {
fn drop(&mut self) {
// unlink the file
// we are clear to do this, because we have entered a gate
let path = &self.buffered_writer.as_inner().as_inner().path;
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!("could not remove ephemeral file '{path}': {e}");
}
}
}
}
impl EphemeralFile {
pub(crate) fn len(&self) -> u64 {
self.bytes_written
self.rw.bytes_written()
}
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
self.rw.page_cache_file_id()
}
/// See [`self::page_caching::RW::load_to_vec`].
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
let size = self.len().into_usize();
let vec = Vec::with_capacity(size);
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
assert_eq!(nread, size);
let vec = slice.into_inner();
assert_eq!(vec.len(), nread);
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
Ok(vec)
self.rw.load_to_vec(ctx).await
}
/// Returns the offset at which the first byte of the input was written, for use
/// in constructing indices over the written value.
///
/// Panics if the write is short because there's no way we can recover from that.
/// TODO: make upstack handle this as an error.
pub(crate) async fn write_raw(
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
self.rw.read_blk(blknum, ctx).await
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<u64> {
let pos = self.bytes_written;
) -> Result<u64, io::Error> {
let pos = self.rw.bytes_written();
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!(
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
srcbuf_len = srcbuf.len(),
),
)
})?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
self.rw.write_all_borrowed(&len_buf, ctx).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
self.rw.write_all_borrowed(&len_buf, ctx).await?;
}
// Write the payload
let nwritten = self
.buffered_writer
.write_buffered_borrowed(srcbuf, ctx)
.await?;
assert_eq!(
nwritten,
srcbuf.len(),
"buffered writer has no short writes"
);
self.bytes_written = new_bytes_written;
self.rw.write_all_borrowed(srcbuf, ctx).await?;
Ok(pos)
}
}
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
&'b self,
start: u64,
dst: tokio_epoll_uring::Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
let file_size_tracking_writer = self.buffered_writer.as_inner();
let flushed_offset = file_size_tracking_writer.bytes_written();
let buffer = self.buffered_writer.inspect_buffer();
let buffered = &buffer[0..buffer.pending()];
let dst_cap = dst.bytes_total().into_u64();
let end = {
// saturating_add is correct here because the max file size is u64::MAX, so,
// if start + dst.len() > u64::MAX, then we know it will be a short read
let mut end: u64 = start.saturating_add(dst_cap);
if end > self.bytes_written {
end = self.bytes_written;
}
end
};
// inclusive, exclusive
#[derive(Debug)]
struct Range<N>(N, N);
impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
fn len(&self) -> N {
if self.0 > self.1 {
N::zero()
} else {
self.1 - self.0
}
}
}
let written_range = Range(start, std::cmp::min(end, flushed_offset));
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
let dst = if written_range.len() > 0 {
let file: &VirtualFile = file_size_tracking_writer.as_inner();
let bounds = dst.bounds();
let slice = file
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
.await?;
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
} else {
dst
};
let dst = if buffered_range.len() > 0 {
let offset_in_buffer = buffered_range
.0
.checked_sub(flushed_offset)
.unwrap()
.into_usize();
let to_copy =
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
let bounds = dst.bounds();
let mut view = dst.slice({
let start = written_range.len().into_usize();
let end = start
.checked_add(buffered_range.len().into_usize())
.unwrap();
start..end
});
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
} else {
dst
};
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
Ok((dst, (end - start).into_usize()))
}
}
/// Does the given filename look like an ephemeral file?
pub fn is_ephemeral_file(filename: &str) -> bool {
if let Some(rest) = filename.strip_prefix("ephemeral-") {
@@ -245,13 +114,19 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
}
}
#[cfg(test)]
mod tests {
use rand::Rng;
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -282,6 +157,69 @@ mod tests {
Ok((conf, tenant_shard_id, timeline_id, ctx))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
let pos_bar = file.write_blob(b"bar", &ctx).await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor()
.read_blob(pos_bar, &ctx)
.await?
.as_slice()
);
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
// also test with some larger blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data, &ctx).await?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos, &ctx).await?;
assert_eq!(actual, expected);
}
// Test a large blob that spans multiple pages
let mut large_data = vec![0; 20000];
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
assert_eq!(result, large_data);
Ok(())
}
#[tokio::test]
async fn ephemeral_file_holds_gate_open() {
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
@@ -315,151 +253,4 @@ mod tests {
.expect("closing completes right away")
.expect("closing does not panic");
}
#[tokio::test]
async fn test_ephemeral_file_basics() {
let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let write_nbytes = cap + cap / 2;
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(write_nbytes)
.collect();
let mut value_offsets = Vec::new();
for i in 0..write_nbytes {
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
value_offsets.push(off);
}
assert!(file.len() as usize == write_nbytes);
for i in 0..write_nbytes {
assert_eq!(value_offsets[i], i.into_u64());
let buf = Vec::with_capacity(1);
let (buf_slice, nread) = file
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
.await
.unwrap();
let buf = buf_slice.into_inner();
assert_eq!(nread, 1);
assert_eq!(&buf, &content[i..i + 1]);
}
let file_contents =
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
assert_eq!(file_contents, &content[0..cap]);
let buffer_contents = file.buffered_writer.inspect_buffer();
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
}
#[tokio::test]
async fn test_flushes_do_happen() {
let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap + cap / 2)
.collect();
file.write_raw(&content, &ctx).await.unwrap();
// assert the state is as this test expects it to be
assert_eq!(
&file.load_to_vec(&ctx).await.unwrap(),
&content[0..cap + cap / 2]
);
let md = file
.buffered_writer
.as_inner()
.as_inner()
.path
.metadata()
.unwrap();
assert_eq!(
md.len(),
cap.into_u64(),
"buffered writer does one write if we write 1.5x buffer capacity"
);
assert_eq!(
&file.buffered_writer.inspect_buffer()[0..cap / 2],
&content[cap..cap + cap / 2]
);
}
#[tokio::test]
async fn test_read_split_across_file_and_buffer() {
// This test exercises the logic on the read path that splits the logical read
// into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
//
// This test builds on the assertions in test_flushes_do_happen
let (conf, tenant_id, timeline_id, ctx) =
harness("test_read_split_across_file_and_buffer").unwrap();
let gate = utils::sync::gate::Gate::default();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap + cap / 2)
.collect();
file.write_raw(&content, &ctx).await.unwrap();
let test_read = |start: usize, len: usize| {
let file = &file;
let ctx = &ctx;
let content = &content;
async move {
let (buf, nread) = file
.read_exact_at_eof_ok(
start.into_u64(),
Vec::with_capacity(len).slice_full(),
ctx,
)
.await
.unwrap();
assert_eq!(nread, len);
assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
}
};
// completely within the file range
assert!(20 < cap, "test assumption");
test_read(10, 10).await;
// border onto edge of file
test_read(cap - 10, 10).await;
// read across file and buffer
test_read(cap - 10, 20).await;
// stay from start of buffer
test_read(cap, 10).await;
// completely within buffer
test_read(cap + 10, 10).await;
}
}
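(Aside, not part of the diff: read_exact_at_eof_ok above, exercised by test_read_split_across_file_and_buffer, splits a logical read at the buffered writer's flushed offset: bytes below it come from the file, the rest from the in-memory tail. A minimal sketch of just that range arithmetic; the I/O itself is omitted.)

use std::ops::Range;

// Returns (range served from the flushed file, range served from the in-memory buffer).
// An empty range means that side is not involved in the read.
fn split_read(start: u64, end: u64, flushed_offset: u64) -> (Range<u64>, Range<u64>) {
    let from_file = start.min(flushed_offset)..end.min(flushed_offset);
    let from_buffer = start.max(flushed_offset)..end.max(flushed_offset);
    (from_file, from_buffer)
}

fn main() {
    // a read straddling a flush boundary at 64 KiB
    let (file, buf) = split_read(65_526, 65_546, 65_536);
    assert_eq!(file, 65_526..65_536);
    assert_eq!(buf, 65_536..65_546);

    // a read entirely within the buffered tail
    let (file, buf) = split_read(70_000, 70_010, 65_536);
    assert!(file.is_empty());
    assert_eq!(buf, 70_000..70_010);
}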

View File

@@ -0,0 +1,153 @@
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
//!
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::BlockLease;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::VirtualFile;
use std::io::{self};
use tokio_epoll_uring::BoundedBuf;
use tracing::*;
use super::zero_padded_read_write;
/// See module-level comment.
pub struct RW {
page_cache_file_id: page_cache::FileId,
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
_gate_guard: utils::sync::gate::GateGuard,
}
impl RW {
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
let page_cache_file_id = page_cache::next_file_id();
Self {
page_cache_file_id,
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
_gate_guard,
}
}
pub fn page_cache_file_id(&self) -> page_cache::FileId {
self.page_cache_file_id
}
pub(crate) async fn write_all_borrowed(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<usize, io::Error> {
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
// because Compute is unlikely to access recently written data.
self.rw.write_all_borrowed(srcbuf, ctx).await
}
pub(crate) fn bytes_written(&self) -> u64 {
self.rw.bytes_written()
}
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
///
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
/// The last block is zero-padded to [`PAGE_SZ`], so the returned buffer's length is always a multiple of [`PAGE_SZ`].
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
// round up to the next PAGE_SZ multiple, required by blob_io
let size = {
let s = usize::try_from(self.bytes_written()).unwrap();
if s % PAGE_SZ == 0 {
s
} else {
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
}
};
let vec = Vec::with_capacity(size);
// read from disk what we've already flushed
let file_size_tracking_writer = self.rw.as_writer();
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
let mut vec = file_size_tracking_writer
.as_inner()
.read_exact_at(
vec.slice(0..(flushed_range.end - flushed_range.start)),
u64::try_from(flushed_range.start).unwrap(),
ctx,
)
.await?
.into_inner();
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
let buffered = self.rw.get_tail_zero_padded();
vec.extend_from_slice(buffered);
assert_eq!(vec.len(), size);
assert_eq!(vec.len() % PAGE_SZ, 0);
Ok(vec)
}
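(Aside, not part of the diff: load_to_vec above rounds the written size up to the next PAGE_SZ multiple because blob_io expects page-sized buffers, and the same rounding appears again in get_tail_zero_padded further down. The rounding in isolation; PAGE_SZ = 8 KiB is an assumption here, matching Postgres' block size.)

const PAGE_SZ: usize = 8192; // assumed value of crate::page_cache::PAGE_SZ

fn round_up_to_page(n: usize) -> usize {
    if n % PAGE_SZ == 0 {
        n
    } else {
        n.checked_add(PAGE_SZ - (n % PAGE_SZ)).unwrap()
    }
}

fn main() {
    assert_eq!(round_up_to_page(0), 0);
    assert_eq!(round_up_to_page(1), 8192);
    assert_eq!(round_up_to_page(8192), 8192);
    assert_eq!(round_up_to_page(8193), 16384);
}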
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
match self.rw.read_blk(blknum).await? {
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
let cache = page_cache::get();
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.rw.as_writer().as_inner().path,
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.as_inner()
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));
}
}
}
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
Ok(BlockLease::EphemeralFileMutableTail(buffer))
}
}
}
}
impl Drop for RW {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// unlink the file
// we are clear to do this, because we have entered a gate
let path = &self.rw.as_writer().as_inner().path;
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!("could not remove ephemeral file '{path}': {e}");
}
}
}
}

View File

@@ -0,0 +1,145 @@
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
//!
//! # Writes
//!
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
//!
//! # Reads
//!
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
//!
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
//! if the read is for the prefix that has already been flushed.
//!
//! # Current Usage
//!
//! The current user of this module is [`super::page_caching::RW`].
mod zero_padded;
use crate::{
context::RequestContext,
page_cache::PAGE_SZ,
virtual_file::owned_buffers_io::{
self,
write::{Buffer, OwnedAsyncWriter},
},
};
const TAIL_SZ: usize = 64 * 1024;
/// See module-level comment.
pub struct RW<W: OwnedAsyncWriter> {
buffered_writer: owned_buffers_io::write::BufferedWriter<
zero_padded::Buffer<TAIL_SZ>,
owned_buffers_io::util::size_tracking_writer::Writer<W>,
>,
}
pub enum ReadResult<'a, W> {
NeedsReadFromWriter { writer: &'a W },
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
}
impl<W> RW<W>
where
W: OwnedAsyncWriter,
{
pub fn new(writer: W) -> Self {
let bytes_flushed_tracker =
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
bytes_flushed_tracker,
zero_padded::Buffer::default(),
);
Self { buffered_writer }
}
pub(crate) fn as_writer(&self) -> &W {
self.buffered_writer.as_inner().as_inner()
}
pub async fn write_all_borrowed(
&mut self,
buf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<usize> {
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
}
pub fn bytes_written(&self) -> u64 {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
flushed_offset + u64::try_from(buffer.pending()).unwrap()
}
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
pub fn get_tail_zero_padded(&self) -> &[u8] {
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffer_written_up_to = buffer.pending();
// pad to next page boundary
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
buffer_written_up_to
} else {
buffer_written_up_to
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
.unwrap()
};
&buffer.as_zero_padded_slice()[0..read_up_to]
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
// The trailing page ("block") might only be partially filled,
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
// Moreover, it has to be zero-padded, because when we still had
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
// => check here that the read doesn't go beyond this potentially short trailing page
// => the zero-padding is done in the `else` branch below
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
buffered_offset / (PAGE_SZ as u64)
} else {
(buffered_offset / (PAGE_SZ as u64)) + 1
};
if (blknum as u64) >= blocks_written {
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset:x}")));
}
// assertions for the `if-else` below
assert_eq!(
flushed_offset % (TAIL_SZ as u64), 0,
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
);
assert_eq!(
flushed_offset % (PAGE_SZ as u64),
0,
"the logic below can't handle if the page is spread across the flushed part and the buffer"
);
if read_offset < flushed_offset {
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
Ok(ReadResult::NeedsReadFromWriter {
writer: self.as_writer(),
})
} else {
let read_offset_in_buffer = read_offset
.checked_sub(flushed_offset)
.expect("would have taken `if` branch instead of this one");
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
let zero_padded_slice = buffer.as_zero_padded_slice();
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
buffer: page
.try_into()
.expect("the slice above got it as page-size slice"),
})
}
}
}
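(Aside, not part of the diff: read_blk above routes a page either to the underlying writer, when the whole page lies below the flushed offset, or to the zero-padded in-memory tail. A self-contained sketch of that routing decision; PAGE_SZ = 8 KiB is assumed and the enum only mirrors ReadResult's shape, it is not the real type.)

const PAGE_SZ: u64 = 8192; // assumed value of crate::page_cache::PAGE_SZ

#[derive(Debug, PartialEq)]
enum Route {
    NeedsReadFromWriter,
    ServedFromTail { offset_in_buffer: u64 },
}

fn route_read(blknum: u32, flushed_offset: u64) -> Route {
    let read_offset = blknum as u64 * PAGE_SZ;
    if read_offset < flushed_offset {
        // relies on the invariant asserted above: flushed_offset is PAGE_SZ-aligned,
        // so a page never straddles the flushed/buffered boundary
        Route::NeedsReadFromWriter
    } else {
        Route::ServedFromTail {
            offset_in_buffer: read_offset - flushed_offset,
        }
    }
}

fn main() {
    assert_eq!(route_read(0, 65_536), Route::NeedsReadFromWriter);
    assert_eq!(
        route_read(8, 65_536),
        Route::ServedFromTail { offset_in_buffer: 0 }
    );
}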

View File

@@ -0,0 +1,110 @@
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
//! unwritten range is guaranteed to be zero-initialized.
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
use std::mem::MaybeUninit;
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
/// See module-level comment.
pub struct Buffer<const N: usize> {
allocation: Box<[u8; N]>,
written: usize,
}
impl<const N: usize> Default for Buffer<N> {
fn default() -> Self {
Self {
allocation: Box::new(
// SAFETY: zeroed memory is a valid [u8; N]
unsafe { MaybeUninit::zeroed().assume_init() },
),
written: 0,
}
}
}
impl<const N: usize> Buffer<N> {
#[inline(always)]
fn invariants(&self) {
// don't check by default, unoptimized is too expensive even for debug mode
if false {
debug_assert!(self.written <= N, "{}", self.written);
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
}
}
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
&self.allocation
}
}
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
type IoBuf = Self;
fn cap(&self) -> usize {
self.allocation.len()
}
fn extend_from_slice(&mut self, other: &[u8]) {
self.invariants();
let remaining = self.allocation.len() - self.written;
if other.len() > remaining {
panic!("calling extend_from_slice() with insufficient remaining capacity");
}
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
self.written += other.len();
self.invariants();
}
fn pending(&self) -> usize {
self.written
}
fn flush(self) -> FullSlice<Self> {
self.invariants();
let written = self.written;
FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
}
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
let Self {
mut allocation,
written,
} = iobuf;
allocation[0..written].fill(0);
let new = Self {
allocation,
written: 0,
};
new.invariants();
new
}
}
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
///
/// Remember that bytes_init is generally _not_ a tracker of the amount
/// of valid data in the io buffer; we use `Slice` for that.
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
///
/// SAFETY:
///
/// The [`Self::allocation`] is stable because boxes are stable.
/// The memory is zero-initialized, so, bytes_init is always N.
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
fn stable_ptr(&self) -> *const u8 {
self.allocation.as_ptr()
}
fn bytes_init(&self) -> usize {
// Yes, N, not self.written; Read the full comment of this impl block!
N
}
fn bytes_total(&self) -> usize {
N
}
}
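A minimal, self-contained sketch (not part of the diff) of the zero-padding contract the buffer above provides: the unwritten tail always reads back as zeros, so a page-sized slice taken past the written prefix is still valid. The 4 KiB page size here is an assumption made only for the example.

fn zero_padded_tail_demo() {
    const PAGE: usize = 4096; // assumed page size for the sketch
    const N: usize = 2 * PAGE;

    let mut allocation = vec![0u8; N]; // starts out all-zero, like Buffer::default()
    let mut written = 0usize;

    // append a 10-byte payload, as extend_from_slice would
    let payload = [0xABu8; 10];
    allocation[written..written + payload.len()].copy_from_slice(&payload);
    written += payload.len();

    // a page-sized read of the trailing (partially filled) page sees the payload followed by zeros
    let page = &allocation[0..PAGE];
    assert_eq!(&page[..written], &payload);
    assert!(page[written..].iter().all(|b| *b == 0));
}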

View File

@@ -464,7 +464,7 @@ impl LayerMap {
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
if Self::is_l0(&layer_desc.key_range) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
@@ -483,7 +483,7 @@ impl LayerMap {
self.historic
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
if Self::is_l0(&layer_desc.key_range) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -600,8 +600,8 @@ impl LayerMap {
}
/// Check if the key range resembles that of an L0 layer.
pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
is_delta_layer && key_range == &(Key::MIN..Key::MAX)
pub fn is_l0(key_range: &Range<Key>) -> bool {
key_range == &(Key::MIN..Key::MAX)
}
/// This function determines which layers are counted in `count_deltas`:
@@ -628,7 +628,7 @@ impl LayerMap {
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(&layer.key_range, layer.is_delta) {
if !Self::is_l0(&layer.key_range) {
return true;
}

View File

@@ -2,12 +2,13 @@
pub mod delta_layer;
pub mod image_layer;
pub mod inmemory_layer;
pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
pub mod merge_iterator;
#[cfg(test)]
pub mod split_writer;
use crate::context::{AccessStatsBehavior, RequestContext};

View File

@@ -36,11 +36,10 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadCoalesceMode, VectoredReadPlanner,
VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
@@ -65,7 +64,7 @@ use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_epoll_uring::IoBuf;
use tokio_epoll_uring::IoBufMut;
use tracing::*;
use utils::{
@@ -233,18 +232,6 @@ pub struct DeltaLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
}
impl DeltaLayerInner {
pub(crate) fn layer_dbg_info(&self) -> String {
format!(
"delta {}..{} {}..{}",
self.key_range().start,
self.key_range().end,
self.lsn_range().start,
self.lsn_range().end
)
}
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
@@ -471,7 +458,7 @@ impl DeltaLayerWriterInner {
ctx: &RequestContext,
) -> (FullSlice<Buf>, anyhow::Result<()>)
where
Buf: IoBuf + Send,
Buf: IoBufMut + Send,
{
assert!(
self.lsn_range.start <= lsn,
@@ -569,6 +556,7 @@ impl DeltaLayerWriterInner {
// 5GB limit for objects without multipart upload (which we don't want to use)
// Make it a little bit below to account for differing GB units
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
ensure!(
metadata.len() <= S3_UPLOAD_LIMIT,
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
@@ -678,7 +666,7 @@ impl DeltaLayerWriter {
ctx: &RequestContext,
) -> (FullSlice<Buf>, anyhow::Result<()>)
where
Buf: IoBuf + Send,
Buf: IoBufMut + Send,
{
self.inner
.as_mut()
@@ -702,10 +690,12 @@ impl DeltaLayerWriter {
self.inner.take().unwrap().finish(key_end, ctx).await
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
#[cfg(test)]
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
@@ -1205,7 +1195,6 @@ impl DeltaLayerInner {
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
let mut read_builder: Option<VectoredReadBuilder> = None;
let read_mode = VectoredReadCoalesceMode::get();
let max_read_size = self
.max_vectored_read_bytes
@@ -1254,7 +1243,6 @@ impl DeltaLayerInner {
offsets.end.pos(),
meta,
max_read_size,
read_mode,
))
}
} else {
@@ -1539,10 +1527,6 @@ pub struct DeltaLayerIterator<'a> {
}
impl<'a> DeltaLayerIterator<'a> {
pub(crate) fn layer_dbg_info(&self) -> String {
self.delta_layer.layer_dbg_info()
}
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
assert!(self.key_values_batch.is_empty());
@@ -2297,7 +2281,7 @@ pub(crate) mod test {
// every key should be a batch b/c the value is larger than max_read_size
assert_eq!(iter.key_values_batch.len(), 1);
} else {
assert!(iter.key_values_batch.len() <= batch_size);
assert_eq!(iter.key_values_batch.len(), batch_size);
}
if num_items >= N {
break;

View File

@@ -167,17 +167,6 @@ pub struct ImageLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
}
impl ImageLayerInner {
pub(crate) fn layer_dbg_info(&self) -> String {
format!(
"image {}..{} {}",
self.key_range().start,
self.key_range().end,
self.lsn()
)
}
}
impl std::fmt::Debug for ImageLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ImageLayerInner")
@@ -716,6 +705,10 @@ struct ImageLayerWriterInner {
}
impl ImageLayerWriterInner {
fn size(&self) -> u64 {
self.tree.borrow_writer().size() + self.blob_writer.size()
}
///
/// Start building a new image layer.
///
@@ -850,19 +843,13 @@ impl ImageLayerWriterInner {
res?;
}
let final_key_range = if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
};
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: self.tenant_shard_id.tenant_id,
timeline_id: self.timeline_id,
key_range: final_key_range.clone(),
key_range: self.key_range.clone(),
lsn: self.lsn,
index_start_blk,
index_root_blk,
@@ -883,7 +870,11 @@ impl ImageLayerWriterInner {
let desc = PersistentLayerDesc::new_img(
self.tenant_shard_id,
self.timeline_id,
final_key_range,
if let Some(end_key) = end_key {
self.key_range.start..end_key
} else {
self.key_range.clone()
},
self.lsn,
metadata.len(),
);
@@ -972,12 +963,14 @@ impl ImageLayerWriter {
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
}
#[cfg(test)]
/// Estimated size of the image layer.
pub(crate) fn estimated_size(&self) -> u64 {
let inner = self.inner.as_ref().unwrap();
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
}
#[cfg(test)]
pub(crate) fn num_keys(&self) -> usize {
self.inner.as_ref().unwrap().num_keys
}
@@ -993,6 +986,7 @@ impl ImageLayerWriter {
self.inner.take().unwrap().finish(timeline, ctx, None).await
}
#[cfg(test)]
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
pub(super) async fn finish_with_end_key(
mut self,
@@ -1006,6 +1000,10 @@ impl ImageLayerWriter {
.finish(timeline, ctx, Some(end_key))
.await
}
pub(crate) fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
}
impl Drop for ImageLayerWriter {
@@ -1026,10 +1024,6 @@ pub struct ImageLayerIterator<'a> {
}
impl<'a> ImageLayerIterator<'a> {
pub(crate) fn layer_dbg_info(&self) -> String {
self.image_layer.layer_dbg_info()
}
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
assert!(self.key_values_batch.is_empty());
@@ -1381,7 +1375,7 @@ mod test {
// every key should be a batch b/c the value is larger than max_read_size
assert_eq!(iter.key_values_batch.len(), 1);
} else {
assert!(iter.key_values_batch.len() <= batch_size);
assert_eq!(iter.key_values_batch.len(), batch_size);
}
if num_items >= N {
break;

View File

@@ -4,23 +4,23 @@
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
//! its position in the file, is kept in memory, though.
//!
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value};
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::{l0_flush, page_cache};
use anyhow::{anyhow, Context, Result};
use bytes::Bytes;
use anyhow::{anyhow, Result};
use camino::Utf8PathBuf;
use pageserver_api::key::CompactKey;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
@@ -33,14 +33,12 @@ use std::fmt::Write;
use std::ops::Range;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::atomic::{AtomicU64, AtomicUsize};
use tokio::sync::RwLock;
use tokio::sync::{RwLock, RwLockWriteGuard};
use super::{
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
};
pub(crate) mod vectored_dio_read;
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
@@ -80,9 +78,9 @@ impl std::fmt::Debug for InMemoryLayer {
pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The [`IndexEntry`] is an offset into the
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
@@ -92,154 +90,6 @@ pub struct InMemoryLayerInner {
resource_units: GlobalResourceUnits,
}
/// Support the same max blob length as blob_io, because ultimately
/// all the InMemoryLayer contents end up being written into a delta layer,
/// using the [`crate::tenant::blob_io`].
const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
trailing_ones
};
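A worked example (not part of the diff) of the derivation above, using an assumed MAX_SUPPORTED_BLOB_LEN of 0x0FFF_FFFF (256 MiB minus one) purely for illustration:

// 0x0FFF_FFFF == 2^28 - 1 has 28 trailing ones and 36 leading zeros on a 64-bit usize,
// so trailing_ones + leading_zeros == 64 holds and the length field needs 28 bits.
const EXAMPLE_BLOB_LEN: usize = 0x0FFF_FFFF; // assumed value, for illustration only
const EXAMPLE_LEN_BITS: usize = EXAMPLE_BLOB_LEN.trailing_ones() as usize; // == 28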
/// See [`InMemoryLayerInner::index`].
///
/// For memory efficiency, the data is packed into a u64.
///
/// Layout:
/// - 1 bit: `will_init`
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct IndexEntry(u64);
impl IndexEntry {
/// See [`Self::MAX_SUPPORTED_POS`].
const MAX_SUPPORTED_POS_BITS: usize = {
let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
if remainder < 32 {
panic!("pos can be u32 as per type system, support that");
}
remainder
};
/// The maximum supported blob offset that can be represented by [`Self`].
/// See also [`Self::validate_checkpoint_distance`].
const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
// Layout
const WILL_INIT_RANGE: Range<usize> = 0..1;
const LEN_RANGE: Range<usize> =
Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
const POS_RANGE: Range<usize> =
Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
const _ASSERT: () = {
if Self::POS_RANGE.end != 64 {
panic!("we don't want undefined bits for our own sanity")
}
};
/// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
///
/// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
/// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
///
/// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
/// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
///
/// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
/// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
#[inline(always)]
fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
let IndexEntryNewArgs {
base_offset,
batch_offset,
len,
will_init,
} = arg;
let pos = base_offset
.checked_add(batch_offset)
.ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
if pos.into_usize() > Self::MAX_SUPPORTED_POS {
anyhow::bail!(
"base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
max = Self::MAX_SUPPORTED_POS
);
}
if len > MAX_SUPPORTED_BLOB_LEN {
anyhow::bail!(
"len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
);
}
let mut data: u64 = 0;
use bit_field::BitField;
data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
data.set_bits(Self::LEN_RANGE, len.into_u64());
data.set_bits(Self::POS_RANGE, pos);
Ok(Self(data))
}
#[inline(always)]
fn unpack(&self) -> IndexEntryUnpacked {
use bit_field::BitField;
IndexEntryUnpacked {
will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
len: self.0.get_bits(Self::LEN_RANGE),
pos: self.0.get_bits(Self::POS_RANGE),
}
}
/// See [`Self::new`].
pub(crate) const fn validate_checkpoint_distance(
checkpoint_distance: u64,
) -> Result<(), &'static str> {
if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
return Err("exceeds the maximum supported value");
}
let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
if res.is_none() {
return Err(
"checkpoint distance + max supported blob len overflows in-memory addition",
);
}
// NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
Ok(())
}
const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
let res = Self::validate_checkpoint_distance(
crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
);
if res.is_err() {
panic!("default checkpoint distance is valid")
}
};
}
/// Args to [`IndexEntry::new`].
#[derive(Clone, Copy)]
struct IndexEntryNewArgs {
base_offset: u64,
batch_offset: u64,
len: usize,
will_init: bool,
}
/// Unpacked representation of the bitfielded [`IndexEntry`].
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct IndexEntryUnpacked {
will_init: bool,
len: u64,
pos: u64,
}
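A stripped-down sketch (not part of the diff) of the bit layout described above, with assumed field widths (1-bit will_init, 24-bit len, 39-bit pos) chosen only for illustration; the real widths are derived from MAX_SUPPORTED_BLOB_LEN.

const SKETCH_LEN_BITS: u32 = 24; // assumed width for the sketch
const SKETCH_POS_BITS: u32 = 64 - 1 - SKETCH_LEN_BITS;

fn pack(will_init: bool, len: u64, pos: u64) -> u64 {
    assert!(len < (1u64 << SKETCH_LEN_BITS));
    assert!(pos < (1u64 << SKETCH_POS_BITS));
    (will_init as u64) | (len << 1) | (pos << (1 + SKETCH_LEN_BITS))
}

fn unpack(packed: u64) -> (bool, u64, u64) {
    let will_init = (packed & 1) == 1;
    let len = (packed >> 1) & ((1u64 << SKETCH_LEN_BITS) - 1);
    let pos = packed >> (1 + SKETCH_LEN_BITS);
    (will_init, len, pos)
}

// round-trip: assert_eq!(unpack(pack(true, 42, 7)), (true, 42, 7));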
impl std::fmt::Debug for InMemoryLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InMemoryLayerInner").finish()
@@ -426,12 +276,7 @@ impl InMemoryLayer {
.build();
let inner = self.inner.read().await;
struct ValueRead {
entry_lsn: Lsn,
read: vectored_dio_read::LogicalRead<Vec<u8>>,
}
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
let reader = inner.file.block_cursor();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
@@ -446,62 +291,24 @@ impl InMemoryLayer {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, index_entry) in slice.iter().rev() {
let IndexEntryUnpacked {
pos,
len,
will_init,
} = index_entry.unpack();
reads.entry(key).or_default().push(ValueRead {
entry_lsn: *entry_lsn,
read: vectored_dio_read::LogicalRead::new(
pos,
Vec::with_capacity(len as usize),
),
});
if will_init {
for (entry_lsn, pos) in slice.iter().rev() {
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(*pos, &ctx).await;
if let Err(e) = buf {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
break;
}
}
}
}
// Execute the reads.
let f = vectored_dio_read::execute(
&inner.file,
reads
.iter()
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
&ctx,
);
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
.await;
// Process results into the reconstruct state
'next_key: for (key, value_reads) in reads {
for ValueRead { entry_lsn, read } in value_reads {
match read.into_result().expect("we run execute() above") {
Err(e) => {
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
continue 'next_key;
break;
}
Ok(value_buf) => {
let value = Value::des(&value_buf);
if let Err(e) = value {
reconstruct_state
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
continue 'next_key;
}
let key_situation =
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
// TODO: metric to see if we fetched more values than necessary
continue 'next_key;
}
// process the next value in the next iteration of the loop
let key_situation =
reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
break;
}
}
}
@@ -513,68 +320,6 @@ impl InMemoryLayer {
}
}
/// Offset of a particular Value within a serialized batch.
struct SerializedBatchOffset {
key: CompactKey,
lsn: Lsn,
// TODO: separate type when we start serde-serializing this value, to avoid coupling
// in-memory representation to serialization format.
index_entry: IndexEntry,
}
pub struct SerializedBatch {
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
pub(crate) raw: Vec<u8>,
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
offsets: Vec<SerializedBatchOffset>,
/// The highest LSN of any value in the batch
pub(crate) max_lsn: Lsn,
}
impl SerializedBatch {
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
let mut max_lsn: Lsn = Lsn(0);
for (key, lsn, val_ser_size, val) in batch {
let relative_off = cursor.position();
val.ser_into(&mut cursor)
.expect("Writing into in-memory buffer is infallible");
offsets.push(SerializedBatchOffset {
key,
lsn,
index_entry: IndexEntry::new(IndexEntryNewArgs {
base_offset: 0,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
})
.context("higher-level code ensures that values are within supported ranges")?,
});
max_lsn = std::cmp::max(max_lsn, lsn);
}
let buffer = cursor.into_inner();
// Assert that we didn't do any extra allocations while building buffer.
debug_assert!(buffer.len() <= buffer_size);
Ok(Self {
raw: buffer,
offsets,
max_lsn,
})
}
}
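A minimal sketch (not part of the diff) of the offset rebasing that put_batch performs on these entries: offsets inside the batch are relative to the start of the batch buffer and become absolute file positions once the batch is appended at `base_offset`.

fn rebase_offsets(relative_offsets: &[u64], base_offset: u64) -> Vec<u64> {
    relative_offsets
        .iter()
        .map(|rel| {
            base_offset
                .checked_add(*rel)
                .expect("file offsets must not overflow u64")
        })
        .collect()
}

// e.g. a batch written at base_offset 4096 with entries at relative offsets 0 and 100
// ends up indexed at absolute positions 4096 and 4196.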
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
}
@@ -635,69 +380,53 @@ impl InMemoryLayer {
})
}
/// Write path.
///
/// Errors are not retryable: the [`InMemoryLayer`] must be discarded and must not be read from.
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
pub async fn put_batch(
// Write operations
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
&self,
serialized_batch: SerializedBatch,
key: CompactKey,
lsn: Lsn,
buf: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
}
let base_offset = inner.file.len();
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: CompactKey,
lsn: Lsn,
buf: &[u8],
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let SerializedBatch {
raw,
mut offsets,
max_lsn: _,
} = serialized_batch;
let off = {
locked_inner
.file
.write_blob(
buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
)
.await?
};
// Add the base_offset to the batch's index entries which are relative to the batch start.
for offset in &mut offsets {
let IndexEntryUnpacked {
will_init,
len,
pos,
} = offset.index_entry.unpack();
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
base_offset,
batch_offset: pos,
len: len.into_usize(),
will_init,
})?;
let vec_map = locked_inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
}
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
// the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
for SerializedBatchOffset {
key,
lsn,
index_entry,
} in offsets
{
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
}
}
inner.resource_units.maybe_publish_size(new_size);
let size = locked_inner.file.len();
locked_inner.resource_units.maybe_publish_size(size);
Ok(())
}
@@ -741,7 +470,7 @@ impl InMemoryLayer {
{
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _) in vec_map.as_slice() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
@@ -805,23 +534,36 @@ impl InMemoryLayer {
match l0_flush_global_state {
l0_flush::Inner::Direct { .. } => {
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
assert_eq!(
file_contents.len() % PAGE_SZ,
0,
"needed by BlockReaderRef::Slice"
);
assert_eq!(file_contents.len(), {
let written = usize::try_from(inner.file.len()).unwrap();
if written % PAGE_SZ == 0 {
written
} else {
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
}
});
let file_contents = Bytes::from(file_contents);
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, entry) in vec_map
.as_slice()
.iter()
.map(|(lsn, entry)| (lsn, entry.unpack()))
{
let IndexEntryUnpacked {
pos,
len,
will_init,
} = entry;
let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
let (_buf, res) = delta_layer_writer
for (lsn, pos) in vec_map.as_slice() {
// TODO: once we have blob lengths in the in-memory index, we can
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
// 2. load the file contents into a Bytes and
// 3. then use `Bytes::slice` to get the `buf` that is our blob
// 4. pass that `buf` into `put_value_bytes`
// => https://github.com/neondatabase/neon/issues/8183
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
let will_init = Value::des(&buf)?.will_init();
let (tmp, res) = delta_layer_writer
.put_value_bytes(
Key::from_compact(*key),
*lsn,
@@ -831,6 +573,7 @@ impl InMemoryLayer {
)
.await;
res?;
buf = tmp.into_raw_slice().into_inner();
}
}
}
@@ -852,134 +595,3 @@ impl InMemoryLayer {
Ok(Some((desc, path)))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_index_entry() {
const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
use IndexEntryNewArgs as Args;
use IndexEntryUnpacked as Unpacked;
let roundtrip = |args, expect: Unpacked| {
let res = IndexEntry::new(args).expect("this tests expects no errors");
let IndexEntryUnpacked {
will_init,
len,
pos,
} = res.unpack();
assert_eq!(will_init, expect.will_init);
assert_eq!(len, expect.len);
assert_eq!(pos, expect.pos);
};
// basic roundtrip
for pos in [0, MAX_SUPPORTED_POS] {
for len in [0, MAX_SUPPORTED_BLOB_LEN] {
for will_init in [true, false] {
let expect = Unpacked {
will_init,
len: len.into_u64(),
pos: pos.into_u64(),
};
roundtrip(
Args {
will_init,
base_offset: pos.into_u64(),
batch_offset: 0,
len,
},
expect,
);
roundtrip(
Args {
will_init,
base_offset: 0,
batch_offset: pos.into_u64(),
len,
},
expect,
);
}
}
}
// too-large len
let too_large = Args {
will_init: false,
len: MAX_SUPPORTED_BLOB_LEN + 1,
base_offset: 0,
batch_offset: 0,
};
assert!(IndexEntry::new(too_large).is_err());
// too-large pos
{
let too_large = Args {
will_init: false,
len: 0,
base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
batch_offset: 0,
};
assert!(IndexEntry::new(too_large).is_err());
let too_large = Args {
will_init: false,
len: 0,
base_offset: 0,
batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
};
assert!(IndexEntry::new(too_large).is_err());
}
// too large (base_offset + batch_offset)
{
let too_large = Args {
will_init: false,
len: 0,
base_offset: MAX_SUPPORTED_POS.into_u64(),
batch_offset: 1,
};
assert!(IndexEntry::new(too_large).is_err());
let too_large = Args {
will_init: false,
len: 0,
base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
};
assert!(IndexEntry::new(too_large).is_err());
}
// valid special cases
// - area past the max supported pos that is accessible by len
for len in [1, MAX_SUPPORTED_BLOB_LEN] {
roundtrip(
Args {
will_init: false,
len,
base_offset: MAX_SUPPORTED_POS.into_u64(),
batch_offset: 0,
},
Unpacked {
will_init: false,
len: len as u64,
pos: MAX_SUPPORTED_POS.into_u64(),
},
);
roundtrip(
Args {
will_init: false,
len,
base_offset: 0,
batch_offset: MAX_SUPPORTED_POS.into_u64(),
},
Unpacked {
will_init: false,
len: len as u64,
pos: MAX_SUPPORTED_POS.into_u64(),
},
);
}
}
}

View File

@@ -1,937 +0,0 @@
use std::{
collections::BTreeMap,
sync::{Arc, RwLock},
};
use itertools::Itertools;
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
use crate::{
assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
context::RequestContext,
};
/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
pub trait File: Send {
/// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())`
/// and return the number of bytes read (let's call it `nread`).
/// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes.
///
/// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`)
/// is if the file is shorter than `start+dst.len()`.
///
/// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an
/// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
///
/// No guarantees are made about the remaining bytes in `dst` in case of a short read.
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
dst: Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(Slice<B>, usize)>;
}
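A hedged usage sketch (not part of the diff) of the short-read contract: a caller treats `nread < dst.bytes_total()` as end-of-file, not as an error. It assumes the surrounding module's imports (RequestContext, and the BoundedBuf helpers `slice_full`/`into_inner` from tokio_epoll_uring) are in scope.

async fn read_up_to<F: File>(
    file: &F,
    start: u64,
    want: usize,
    ctx: &RequestContext,
) -> std::io::Result<Vec<u8>> {
    let buf = Vec::with_capacity(want);
    let (slice, nread) = file.read_exact_at_eof_ok(start, buf.slice_full(), ctx).await?;
    let mut out = slice.into_inner();
    out.truncate(nread); // nread < want simply means we hit EOF
    Ok(out)
}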
/// A logical read from [`File`]. See [`Self::new`].
pub struct LogicalRead<B: Buffer> {
pos: u64,
state: RwLockRefCell<LogicalReadState<B>>,
}
enum LogicalReadState<B: Buffer> {
NotStarted(B),
Ongoing(B),
Ok(B),
Error(Arc<std::io::Error>),
Undefined,
}
impl<B: Buffer> LogicalRead<B> {
/// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`.
pub fn new(pos: u64, buf: B) -> Self {
Self {
pos,
state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)),
}
}
pub fn into_result(self) -> Option<Result<B, Arc<std::io::Error>>> {
match self.state.into_inner() {
LogicalReadState::Ok(buf) => Some(Ok(buf)),
LogicalReadState::Error(e) => Some(Err(e)),
LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None,
LogicalReadState::Undefined => unreachable!(),
}
}
}
/// The buffer into which a [`LogicalRead`] result is placed.
pub trait Buffer: std::ops::Deref<Target = [u8]> {
/// Immutable.
fn cap(&self) -> usize;
/// Changes only through [`Self::extend_from_slice`].
fn len(&self) -> usize;
/// Panics if the total length would exceed the initialized capacity.
fn extend_from_slice(&mut self, src: &[u8]);
}
/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO.
const DIO_CHUNK_SIZE: usize = 512;
/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`.
/// (The unit is the number of chunks.)
const MAX_CHUNK_BATCH_SIZE: usize = {
let desired = 128 * 1024; // 128k
if desired % DIO_CHUNK_SIZE != 0 {
panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
// compile-time error
}
desired / DIO_CHUNK_SIZE
};
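A small sketch (not part of the diff) of how the planner below maps a logical read onto chunk numbers; adjacent chunks are then merged into physical reads of at most MAX_CHUNK_BATCH_SIZE chunks.

fn chunk_range(pos: u64, len: usize) -> std::ops::Range<u64> {
    assert!(len > 0, "sketch assumes a non-empty read");
    let first = pos / DIO_CHUNK_SIZE as u64;
    let last = (pos + len as u64 - 1) / DIO_CHUNK_SIZE as u64;
    first..last + 1
}

// e.g. with a 512-byte chunk size, a 2-byte read at offset 511 touches chunks 0 and 1:
// assert_eq!(chunk_range(511, 2), 0..2);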
/// Execute the given logical `reads` against `file`.
/// The results are placed in the buffers of the [`LogicalRead`]s.
/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`].
///
/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function.
/// Otherwise, this function panics.
pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext)
where
I: IntoIterator<Item = &'a LogicalRead<B>>,
F: File,
B: Buffer + IoBufMut + Send,
{
// Terminology:
// logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity
// chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges
// interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests
// physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity
// Preserve a copy of the logical reads for debug assertions at the end
#[cfg(debug_assertions)]
let (reads, assert_logical_reads) = {
let (reads, assert) = reads.into_iter().tee();
(reads, Some(Vec::from_iter(assert)))
};
#[cfg(not(debug_assertions))]
let (reads, assert_logical_reads): (_, Option<Vec<&'a LogicalRead<B>>>) = (reads, None);
// Plan which parts of which chunks need to be appended to which buffer
let mut by_chunk: BTreeMap<u64, Vec<Interest<B>>> = BTreeMap::new();
struct Interest<'a, B: Buffer> {
logical_read: &'a LogicalRead<B>,
offset_in_chunk: u64,
len: u64,
}
for logical_read in reads {
let LogicalRead { pos, state } = logical_read;
let mut state = state.borrow_mut();
// transition from NotStarted to Ongoing
let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined);
let req_len = match cur {
LogicalReadState::NotStarted(buf) => {
if buf.len() != 0 {
panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`");
}
// buf.cap() == 0 is ok
// transition into Ongoing state
let req_len = buf.cap();
*state = LogicalReadState::Ongoing(buf);
req_len
}
x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"),
};
// plan which chunks we need to read from
let mut remaining = req_len;
let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64());
let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE;
while remaining > 0 {
let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk);
by_chunk.entry(chunk_no).or_default().push(Interest {
logical_read,
offset_in_chunk: offset_in_chunk.into_u64(),
len: remaining_in_chunk.into_u64(),
});
offset_in_chunk = 0;
chunk_no += 1;
remaining -= remaining_in_chunk;
}
}
// At this point, we could iterate over by_chunk, in chunk order,
// read each chunk from disk, and fill the buffers.
// However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE
// so we issue fewer IOs = fewer roundtrips = lower overall latency.
struct PhysicalRead<'a, B: Buffer> {
start_chunk_no: u64,
nchunks: usize,
dsts: Vec<PhysicalInterest<'a, B>>,
}
struct PhysicalInterest<'a, B: Buffer> {
logical_read: &'a LogicalRead<B>,
offset_in_physical_read: u64,
len: u64,
}
let mut physical_reads: Vec<PhysicalRead<B>> = Vec::new();
let mut by_chunk = by_chunk.into_iter().peekable();
loop {
let mut last_chunk_no = None;
let to_merge: Vec<(u64, Vec<Interest<B>>)> = by_chunk
.peeking_take_while(|(chunk_no, _)| {
if let Some(last_chunk_no) = last_chunk_no {
if *chunk_no != last_chunk_no + 1 {
return false;
}
}
last_chunk_no = Some(*chunk_no);
true
})
.take(MAX_CHUNK_BATCH_SIZE)
.collect(); // TODO: avoid this .collect()
let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else {
break;
};
let nchunks = to_merge.len();
let dsts = to_merge
.into_iter()
.enumerate()
.flat_map(|(i, (_, dsts))| {
dsts.into_iter().map(
move |Interest {
logical_read,
offset_in_chunk,
len,
}| {
PhysicalInterest {
logical_read,
offset_in_physical_read: i
.checked_mul(DIO_CHUNK_SIZE)
.unwrap()
.into_u64()
+ offset_in_chunk,
len,
}
},
)
})
.collect();
physical_reads.push(PhysicalRead {
start_chunk_no,
nchunks,
dsts,
});
}
drop(by_chunk);
// Execute physical reads and fill the logical read buffers
// TODO: pipelined reads; prefetch;
let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
for PhysicalRead {
start_chunk_no,
nchunks,
dsts,
} in physical_reads
{
let all_done = dsts
.iter()
.all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal());
if all_done {
continue;
}
let read_offset = start_chunk_no
.checked_mul(DIO_CHUNK_SIZE.into_u64())
.expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier");
let io_buf = get_io_buffer(nchunks).slice_full();
let req_len = io_buf.len();
let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await
{
Ok(t) => t,
Err(e) => {
let e = Arc::new(e);
for PhysicalInterest { logical_read, .. } in dsts {
*logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e));
// this will make later reads for the given LogicalRead short-circuit, see top of loop body
}
continue;
}
};
let io_buf = io_buf_slice.into_inner();
assert!(
nread <= io_buf.len(),
"the last chunk in the file can be a short read, so, no =="
);
let io_buf = &io_buf[..nread];
for PhysicalInterest {
logical_read,
offset_in_physical_read,
len,
} in dsts
{
let mut logical_read_state_borrow = logical_read.state.borrow_mut();
let logical_read_buf = match &mut *logical_read_state_borrow {
LogicalReadState::NotStarted(_) => {
unreachable!("we transition it into Ongoing at function entry")
}
LogicalReadState::Ongoing(buf) => buf,
LogicalReadState::Ok(_) | LogicalReadState::Error(_) => {
continue;
}
LogicalReadState::Undefined => unreachable!(),
};
let range_in_io_buf = std::ops::Range {
start: offset_in_physical_read as usize,
end: offset_in_physical_read as usize + len as usize,
};
assert!(range_in_io_buf.end >= range_in_io_buf.start);
if range_in_io_buf.end > nread {
let msg = format!(
"physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}",
&*logical_read_state_borrow
);
logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new(
std::io::ErrorKind::UnexpectedEof,
msg,
)));
continue;
}
let data = &io_buf[range_in_io_buf];
// Copy data from io buffer into the logical read buffer.
// (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.)
let pre = if cfg!(debug_assertions) {
Some((logical_read_buf.len(), logical_read_buf.cap()))
} else {
None
};
logical_read_buf.extend_from_slice(data);
let post = if cfg!(debug_assertions) {
Some((logical_read_buf.len(), logical_read_buf.cap()))
} else {
None
};
match (pre, post) {
(None, None) => {}
(Some(_), None) | (None, Some(_)) => unreachable!(),
(Some((pre_len, pre_cap)), Some((post_len, post_cap))) => {
assert_eq!(pre_len + len as usize, post_len);
assert_eq!(pre_cap, post_cap);
}
}
if logical_read_buf.len() == logical_read_buf.cap() {
logical_read_state_borrow.transition_to_terminal(Ok(()));
}
}
}
if let Some(assert_logical_reads) = assert_logical_reads {
for logical_read in assert_logical_reads {
assert!(logical_read.state.borrow().is_terminal());
}
}
}
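A usage sketch (not part of the diff), mirroring what the tests below do: create fresh LogicalReads, drive them through execute(), then collect each read's terminal result.

async fn read_two_ranges<F: File>(
    file: &F,
    ctx: &RequestContext,
) -> Vec<Result<Vec<u8>, Arc<std::io::Error>>> {
    let reads = vec![
        LogicalRead::new(0, Vec::with_capacity(16)),
        LogicalRead::new(4096, Vec::with_capacity(32)),
    ];
    execute(file, reads.iter(), ctx).await;
    reads
        .into_iter()
        .map(|r| {
            r.into_result()
                .expect("execute() drives every read to a terminal state")
        })
        .collect()
}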
impl<B: Buffer> LogicalReadState<B> {
fn is_terminal(&self) -> bool {
match self {
LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false,
LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true,
LogicalReadState::Undefined => unreachable!(),
}
}
fn transition_to_terminal(&mut self, err: std::io::Result<()>) {
let cur = std::mem::replace(self, LogicalReadState::Undefined);
let buf = match cur {
LogicalReadState::Ongoing(buf) => buf,
x => panic!("must only call in state Ongoing, got {x:?}"),
};
*self = match err {
Ok(()) => LogicalReadState::Ok(buf),
Err(e) => LogicalReadState::Error(Arc::new(e)),
};
}
}
impl<B: Buffer> std::fmt::Debug for LogicalReadState<B> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
#[derive(Debug)]
#[allow(unused)]
struct BufferDebug {
len: usize,
cap: usize,
}
impl<'a> From<&'a dyn Buffer> for BufferDebug {
fn from(buf: &'a dyn Buffer) -> Self {
Self {
len: buf.len(),
cap: buf.cap(),
}
}
}
match self {
LogicalReadState::NotStarted(b) => {
write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer))
}
LogicalReadState::Ongoing(b) => {
write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer))
}
LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)),
LogicalReadState::Error(e) => write!(f, "Error({:?})", e),
LogicalReadState::Undefined => write!(f, "Undefined"),
}
}
}
#[derive(Debug)]
struct RwLockRefCell<T>(RwLock<T>);
impl<T> RwLockRefCell<T> {
fn new(value: T) -> Self {
Self(RwLock::new(value))
}
fn borrow(&self) -> impl std::ops::Deref<Target = T> + '_ {
self.0.try_read().unwrap()
}
fn borrow_mut(&self) -> impl std::ops::DerefMut<Target = T> + '_ {
self.0.try_write().unwrap()
}
fn into_inner(self) -> T {
self.0.into_inner().unwrap()
}
}
impl Buffer for Vec<u8> {
fn cap(&self) -> usize {
self.capacity()
}
fn len(&self) -> usize {
self.len()
}
fn extend_from_slice(&mut self, src: &[u8]) {
if self.len() + src.len() > self.cap() {
panic!("Buffer capacity exceeded");
}
Vec::extend_from_slice(self, src);
}
}
#[cfg(test)]
#[allow(clippy::assertions_on_constants)]
mod tests {
use rand::Rng;
use crate::{
context::DownloadBehavior, task_mgr::TaskKind,
virtual_file::owned_buffers_io::slice::SliceMutExt,
};
use super::*;
use std::{cell::RefCell, collections::VecDeque};
struct InMemoryFile {
content: Vec<u8>,
}
impl InMemoryFile {
fn new_random(len: usize) -> Self {
Self {
content: rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(len)
.collect(),
}
}
fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead {
let expected_result = if pos as usize + len > self.content.len() {
Err("InMemoryFile short read".to_string())
} else {
Ok(self.content[pos as usize..pos as usize + len].to_vec())
};
TestLogicalRead::new(pos, len, expected_result)
}
}
#[test]
fn test_in_memory_file() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let file = InMemoryFile::new_random(10);
let test_read = |pos, len| {
let buf = vec![0; len];
let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
use futures::FutureExt;
let (slice, nread) = fut
.now_or_never()
.expect("impl never awaits")
.expect("impl never errors");
let mut buf = slice.into_inner();
buf.truncate(nread);
buf
};
assert_eq!(test_read(0, 1), &file.content[0..1]);
assert_eq!(test_read(1, 2), &file.content[1..3]);
assert_eq!(test_read(9, 2), &file.content[9..]);
assert!(test_read(10, 2).is_empty());
assert!(test_read(11, 2).is_empty());
}
impl File for InMemoryFile {
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
mut dst: Slice<B>,
_ctx: &'a RequestContext,
) -> std::io::Result<(Slice<B>, usize)> {
let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
let nread = {
let req_len = dst_slice.len();
let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize));
if start as usize >= self.content.len() {
0
} else {
dst_slice[..len]
.copy_from_slice(&self.content[start as usize..start as usize + len]);
len
}
};
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs
Ok((dst, nread))
}
}
#[derive(Clone)]
struct TestLogicalRead {
pos: u64,
len: usize,
expected_result: Result<Vec<u8>, String>,
}
impl TestLogicalRead {
fn new(pos: u64, len: usize, expected_result: Result<Vec<u8>, String>) -> Self {
Self {
pos,
len,
expected_result,
}
}
fn make_logical_read(&self) -> LogicalRead<Vec<u8>> {
LogicalRead::new(self.pos, Vec::with_capacity(self.len))
}
}
async fn execute_and_validate_test_logical_reads<I, F>(
file: &F,
test_logical_reads: I,
ctx: &RequestContext,
) where
I: IntoIterator<Item = TestLogicalRead>,
F: File,
{
let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::<Vec<_>>();
execute(file, logical_reads.iter(), ctx).await;
for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) {
let actual = logical_read.into_result().expect("we call execute()");
match (actual, test_logical_read.expected_result) {
(Ok(actual), Ok(expected)) if actual == expected => {}
(Err(actual), Err(expected)) => {
assert_eq!(actual.to_string(), expected);
}
(actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"),
}
}
}
#[tokio::test]
async fn test_blackbox() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let cs = DIO_CHUNK_SIZE;
let cs_u64 = cs.into_u64();
let file = InMemoryFile::new_random(10 * cs);
let test_logical_reads = vec![
file.test_logical_read(0, 1),
// adjacent to logical_read0
file.test_logical_read(1, 2),
// gap
// spans adjacent chunks
file.test_logical_read(cs_u64 - 1, 2),
// gap
// tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5
file.test_logical_read(3 * cs_u64 - 1, cs + 2),
// gap
file.test_logical_read(5 * cs_u64, 1),
];
let num_test_logical_reads = test_logical_reads.len();
let test_logical_reads_perms = test_logical_reads
.into_iter()
.permutations(num_test_logical_reads);
// test all orderings of LogicalReads; the order shouldn't matter for the results
for test_logical_reads in test_logical_reads_perms {
execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
}
}
#[tokio::test]
#[should_panic]
async fn test_reusing_logical_reads_panics() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let file = InMemoryFile::new_random(DIO_CHUNK_SIZE);
let a = file.test_logical_read(23, 10);
let logical_reads = vec![a.make_logical_read()];
execute(&file, &logical_reads, &ctx).await;
// reuse panics
execute(&file, &logical_reads, &ctx).await;
}
struct RecorderFile<'a> {
recorded: RefCell<Vec<RecordedRead>>,
file: &'a InMemoryFile,
}
struct RecordedRead {
pos: u64,
req_len: usize,
res: Vec<u8>,
}
impl<'a> RecorderFile<'a> {
fn new(file: &'a InMemoryFile) -> RecorderFile<'a> {
Self {
recorded: Default::default(),
file,
}
}
}
impl<'x> File for RecorderFile<'x> {
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
dst: Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(Slice<B>, usize)> {
let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?;
self.recorded.borrow_mut().push(RecordedRead {
pos: start,
req_len: dst.bytes_total(),
res: Vec::from(&dst[..nread]),
});
Ok((dst, nread))
}
}
#[tokio::test]
async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE);
let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10);
let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20);
let recorder = RecorderFile::new(&file);
execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
let recorded = recorder.recorded.borrow();
assert_eq!(recorded.len(), 1);
let RecordedRead { pos, req_len, .. } = &recorded[0];
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
assert_eq!(*req_len, DIO_CHUNK_SIZE);
}
#[tokio::test]
async fn test_max_chunk_batch_size_is_respected() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
// read the 10th byte of each chunk in 3 .. 3 + 1.5*MAX_CHUNK_BATCH_SIZE
assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption");
assert!(10 < DIO_CHUNK_SIZE, "test assumption");
let mut test_logical_reads = Vec::new();
for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 {
test_logical_reads
.push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1));
}
let recorder = RecorderFile::new(&file);
execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await;
let recorded = recorder.recorded.borrow();
assert_eq!(recorded.len(), 2);
{
let RecordedRead { pos, req_len, .. } = &recorded[0];
assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE);
assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
}
{
let RecordedRead { pos, req_len, .. } = &recorded[1];
assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE);
assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE);
}
}
#[tokio::test]
async fn test_batch_breaks_if_chunk_is_not_interesting() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption");
let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE);
let a = file.test_logical_read(0, 1); // chunk 0
let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2
let recorder = RecorderFile::new(&file);
execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
let recorded = recorder.recorded.borrow();
assert_eq!(recorded.len(), 2);
{
let RecordedRead { pos, req_len, .. } = &recorded[0];
assert_eq!(*pos, 0);
assert_eq!(*req_len, DIO_CHUNK_SIZE);
}
{
let RecordedRead { pos, req_len, .. } = &recorded[1];
assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64());
assert_eq!(*req_len, DIO_CHUNK_SIZE);
}
}
struct ExpectedRead {
expect_pos: u64,
expect_len: usize,
respond: Result<Vec<u8>, String>,
}
struct MockFile {
expected: RefCell<VecDeque<ExpectedRead>>,
}
impl Drop for MockFile {
fn drop(&mut self) {
assert!(
self.expected.borrow().is_empty(),
"expected reads not satisfied"
);
}
}
macro_rules! mock_file {
($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{
MockFile {
expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead {
expect_pos: $pos,
expect_len: $len,
respond: $respond,
}),*])),
}
}};
}
impl File for MockFile {
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
&'b self,
start: u64,
mut dst: Slice<B>,
_ctx: &'a RequestContext,
) -> std::io::Result<(Slice<B>, usize)> {
let ExpectedRead {
expect_pos,
expect_len,
respond,
} = self
.expected
.borrow_mut()
.pop_front()
.expect("unexpected read");
assert_eq!(start, expect_pos);
assert_eq!(dst.bytes_total(), expect_len);
match respond {
Ok(mocked_bytes) => {
let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len());
let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
dst_slice[..len].copy_from_slice(&mocked_bytes[..len]);
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
Ok((dst, len))
}
Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
}
}
}
#[tokio::test]
async fn test_mock_file() {
// Self-test to ensure the relevant features of mock file work as expected.
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let mock_file = mock_file! {
0 , 512 => Ok(vec![0; 512]),
512 , 512 => Ok(vec![1; 512]),
1024 , 512 => Ok(vec![2; 10]),
2048, 1024 => Err("foo".to_owned()),
};
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
.await
.unwrap();
assert_eq!(nread, 512);
assert_eq!(&buf.into_inner()[..nread], &[0; 512]);
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
.await
.unwrap();
assert_eq!(nread, 512);
assert_eq!(&buf.into_inner()[..nread], &[1; 512]);
let buf = Vec::with_capacity(512);
let (buf, nread) = mock_file
.read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
.await
.unwrap();
assert_eq!(nread, 10);
assert_eq!(&buf.into_inner()[..nread], &[2; 10]);
let buf = Vec::with_capacity(1024);
let err = mock_file
.read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
.await
.err()
.unwrap();
assert_eq!(err.to_string(), "foo");
}
#[tokio::test]
async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let test_logical_reads = vec![
// read spanning two batches
TestLogicalRead::new(
DIO_CHUNK_SIZE.into_u64() / 2,
MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE,
Err("foo".to_owned()),
),
// second read in failing chunk
TestLogicalRead::new(
(MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10,
5,
Err("foo".to_owned()),
),
// read unaffected
TestLogicalRead::new(
(MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64()
+ 2 * DIO_CHUNK_SIZE.into_u64()
+ 10,
5,
Ok(vec![1; 5]),
),
];
let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
let test_logical_read_perms = tmp.permutations(test_logical_reads.len());
for test_logical_reads in test_logical_read_perms {
let file = mock_file!(
0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]),
(MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()),
(MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]),
);
execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
}
}
struct TestShortReadsSetup {
ctx: RequestContext,
file: InMemoryFile,
written: u64,
}
fn setup_short_chunk_read_tests() -> TestShortReadsSetup {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
assert!(DIO_CHUNK_SIZE > 20, "test assumption");
let written = (2 * DIO_CHUNK_SIZE - 10).into_u64();
let file = InMemoryFile::new_random(written as usize);
TestShortReadsSetup { ctx, file, written }
}
#[tokio::test]
async fn test_short_chunk_read_from_written_range() {
// Test what happens if there are logical reads
// that start within the last chunk, and
// the last chunk is not the full chunk length.
//
// The read should succeed despite the short chunk length.
let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
let a = file.test_logical_read(written - 10, 5);
let recorder = RecorderFile::new(&file);
execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await;
let recorded = recorder.recorded.borrow();
assert_eq!(recorded.len(), 1);
let RecordedRead { pos, req_len, res } = &recorded[0];
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
assert_eq!(*req_len, DIO_CHUNK_SIZE);
assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
}
#[tokio::test]
async fn test_short_chunk_read_and_logical_read_from_unwritten_range() {
// Test what happens if there are logical reads
// that start within the last chunk, and
// the last chunk is not the full chunk length, and
// the logical reads end in the unwritten range.
//
// All should fail with UnexpectedEof and have the same IO pattern.
async fn the_impl(offset_delta: i64) {
let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
let offset = u64::try_from(
i64::try_from(written)
.unwrap()
.checked_add(offset_delta)
.unwrap(),
)
.unwrap();
let a = file.test_logical_read(offset, 5);
let recorder = RecorderFile::new(&file);
let a_vr = a.make_logical_read();
execute(&recorder, vec![&a_vr], &ctx).await;
// validate the LogicalRead result
let a_res = a_vr.into_result().unwrap();
let a_err = a_res.unwrap_err();
assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof);
// validate the IO pattern
let recorded = recorder.recorded.borrow();
assert_eq!(recorded.len(), 1);
let RecordedRead { pos, req_len, res } = &recorded[0];
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
assert_eq!(*req_len, DIO_CHUNK_SIZE);
assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
}
the_impl(-1).await; // start == length - 1
the_impl(0).await; // start == length
the_impl(1).await; // start == length + 1
}
// TODO: mixed: some valid, some UnexpectedEof
// TODO: same tests but with merges
}

View File

@@ -35,8 +35,6 @@ mod tests;
#[cfg(test)]
mod failpoints;
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
@@ -1298,10 +1296,7 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(
&self.layer_desc().key_range,
self.layer_desc().is_delta,
),
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -1494,9 +1489,8 @@ impl LayerInner {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
Ok(elapsed) => {
let accessed_and_visible = self.access_stats.accessed()
&& self.access_stats.visibility() == LayerVisibilityHint::Visible;
if accessed_and_visible {
let accessed = self.access_stats.accessed();
if accessed {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
@@ -1510,7 +1504,7 @@ impl LayerInner {
tracing::info!(
residence_millis = elapsed.as_millis(),
accessed_and_visible,
accessed,
"evicted layer after known residence period"
);
}

View File

@@ -256,10 +256,6 @@ impl LayerName {
LayerName::Delta(layer) => &layer.key_range,
}
}
pub fn is_delta(&self) -> bool {
matches!(self, LayerName::Delta(_))
}
}
impl fmt::Display for LayerName {

View File

@@ -3,7 +3,6 @@ use std::{
collections::{binary_heap, BinaryHeap},
};
use anyhow::bail;
use pageserver_api::key::Key;
use utils::lsn::Lsn;
@@ -27,13 +26,6 @@ impl<'a> LayerRef<'a> {
Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
}
}
fn layer_dbg_info(&self) -> String {
match self {
Self::Image(x) => x.layer_dbg_info(),
Self::Delta(x) => x.layer_dbg_info(),
}
}
}
enum LayerIterRef<'a> {
@@ -48,13 +40,6 @@ impl LayerIterRef<'_> {
Self::Image(x) => x.next().await,
}
}
fn layer_dbg_info(&self) -> String {
match self {
Self::Image(x) => x.layer_dbg_info(),
Self::Delta(x) => x.layer_dbg_info(),
}
}
}
/// This type plays several roles at once
@@ -90,11 +75,6 @@ impl<'a> PeekableLayerIterRef<'a> {
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let result = self.peeked.take();
self.peeked = self.iter.next().await?;
if let (Some((k1, l1, _)), Some((k2, l2, _))) = (&self.peeked, &result) {
if (k1, l1) < (k2, l2) {
bail!("iterator is not ordered: {}", self.iter.layer_dbg_info());
}
}
Ok(result)
}
}
@@ -198,12 +178,7 @@ impl<'a> IteratorWrapper<'a> {
let iter = PeekableLayerIterRef::create(iter).await?;
if let Some((k1, l1, _)) = iter.peek() {
let (k2, l2) = first_key_lower_bound;
if (k1, l1) < (k2, l2) {
bail!(
"layer key range did not include the first key in the layer: {}",
layer.layer_dbg_info()
);
}
debug_assert!((k1, l1) >= (k2, l2));
}
*self = Self::Loaded { iter };
Ok(())

View File

@@ -1,4 +1,4 @@
use std::{future::Future, ops::Range, sync::Arc};
use std::{ops::Range, sync::Arc};
use bytes::Bytes;
use pageserver_api::key::{Key, KEY_SIZE};
@@ -7,32 +7,7 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
use crate::tenant::storage_layer::Layer;
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
use super::layer::S3_UPLOAD_LIMIT;
use super::{
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
};
pub(crate) enum SplitWriterResult {
Produced(ResidentLayer),
Discarded(PersistentLayerKey),
}
#[cfg(test)]
impl SplitWriterResult {
fn into_resident_layer(self) -> ResidentLayer {
match self {
SplitWriterResult::Produced(layer) => layer,
SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
}
}
fn into_discarded_layer(self) -> PersistentLayerKey {
match self {
SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
SplitWriterResult::Discarded(layer) => layer,
}
}
}
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
/// An image writer that takes images and produces multiple image layers. The interface does not
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
@@ -41,12 +16,11 @@ impl SplitWriterResult {
pub struct SplitImageLayerWriter {
inner: ImageLayerWriter,
target_layer_size: u64,
generated_layers: Vec<SplitWriterResult>,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn: Lsn,
start_key: Key,
}
impl SplitImageLayerWriter {
@@ -75,22 +49,16 @@ impl SplitImageLayerWriter {
timeline_id,
tenant_shard_id,
lsn,
start_key,
})
}
pub async fn put_image_with_discard_fn<D, F>(
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
discard: D,
) -> anyhow::Result<()>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
) -> anyhow::Result<()> {
// The current estimation is an upper bound of the space that the key/image could take
// because we did not consider compression in this estimation. The resulting image layer
// could be smaller than the target size.
@@ -108,87 +76,33 @@ impl SplitImageLayerWriter {
)
.await?;
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
let layer_key = PersistentLayerKey {
key_range: self.start_key..key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
self.start_key = key;
if discard(&layer_key).await {
drop(prev_image_writer);
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
self.generated_layers.push(SplitWriterResult::Produced(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
));
}
self.generated_layers.push(
prev_image_writer
.finish_with_end_key(tline, key, ctx)
.await?,
);
}
self.inner.put_image(key, img, ctx).await
}
#[cfg(test)]
pub async fn put_image(
&mut self,
key: Key,
img: Bytes,
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
.await
}
pub(crate) async fn finish_with_discard_fn<D, F>(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
discard: D,
) -> anyhow::Result<Vec<SplitWriterResult>>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
let Self {
mut generated_layers,
inner,
..
} = self;
if inner.num_keys() == 0 {
return Ok(generated_layers);
}
let layer_key = PersistentLayerKey {
key_range: self.start_key..end_key,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
is_delta: false,
};
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
generated_layers.push(SplitWriterResult::Produced(
inner.finish_with_end_key(tline, end_key, ctx).await?,
));
}
Ok(generated_layers)
}
#[cfg(test)]
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
Ok(generated_layers)
}
/// When the split writer fails, the caller should call this function and handle partially generated layers.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
@@ -196,21 +110,15 @@ impl SplitImageLayerWriter {
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
/// to be cleaned up).
///
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
/// will split them into multiple files based on size.
#[must_use]
pub struct SplitDeltaLayerWriter {
inner: DeltaLayerWriter,
target_layer_size: u64,
generated_layers: Vec<SplitWriterResult>,
generated_layers: Vec<ResidentLayer>,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
lsn_range: Range<Lsn>,
last_key_written: Key,
start_key: Key,
}
impl SplitDeltaLayerWriter {
@@ -239,74 +147,9 @@ impl SplitDeltaLayerWriter {
timeline_id,
tenant_shard_id,
lsn_range,
last_key_written: Key::MIN,
start_key,
})
}
/// Put a value into the layer writer. If the writer decides to produce a layer and the discard fn returns true, no layer will be written in the end.
pub async fn put_value_with_discard_fn<D, F>(
&mut self,
key: Key,
lsn: Lsn,
val: Value,
tline: &Arc<Timeline>,
ctx: &RequestContext,
discard: D,
) -> anyhow::Result<()>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
//
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
// strategy. https://github.com/neondatabase/neon/issues/8837
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
if key != self.last_key_written {
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let layer_key = PersistentLayerKey {
key_range: self.start_key..key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
self.start_key = key;
if discard(&layer_key).await {
drop(prev_delta_writer);
self.generated_layers
.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
self.generated_layers
.push(SplitWriterResult::Produced(delta_layer));
}
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
// We have to produce a very large file b/c a key is updated too often.
anyhow::bail!(
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
key,
self.inner.estimated_size()
);
}
}
self.last_key_written = key;
self.inner.put_value(key, lsn, val, ctx).await
}
pub async fn put_value(
&mut self,
key: Key,
@@ -315,64 +158,56 @@ impl SplitDeltaLayerWriter {
tline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
.await
}
pub(crate) async fn finish_with_discard_fn<D, F>(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
discard: D,
) -> anyhow::Result<Vec<SplitWriterResult>>
where
D: FnOnce(&PersistentLayerKey) -> F,
F: Future<Output = bool>,
{
let Self {
mut generated_layers,
inner,
..
} = self;
if inner.num_keys() == 0 {
return Ok(generated_layers);
}
let layer_key = PersistentLayerKey {
key_range: self.start_key..end_key,
lsn_range: self.lsn_range.clone(),
is_delta: true,
};
if discard(&layer_key).await {
generated_layers.push(SplitWriterResult::Discarded(layer_key));
} else {
let (desc, path) = inner.finish(end_key, ctx).await?;
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
if self.inner.num_keys() >= 1
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
{
let next_delta_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
key,
self.lsn_range.clone(),
ctx,
)
.await?;
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(SplitWriterResult::Produced(delta_layer));
self.generated_layers.push(delta_layer);
}
Ok(generated_layers)
self.inner.put_value(key, lsn, val, ctx).await
}
#[allow(dead_code)]
pub(crate) async fn finish(
self,
tline: &Arc<Timeline>,
ctx: &RequestContext,
end_key: Key,
) -> anyhow::Result<Vec<SplitWriterResult>> {
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
.await
) -> anyhow::Result<Vec<ResidentLayer>> {
let Self {
mut generated_layers,
inner,
..
} = self;
let (desc, path) = inner.finish(end_key, ctx).await?;
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
generated_layers.push(delta_layer);
Ok(generated_layers)
}
/// When the split writer fails, the caller should call this function and handle partially generated layers.
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
#[allow(dead_code)]
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
Ok((self.generated_layers, self.inner))
}
}
#[cfg(test)]
mod tests {
use itertools::Itertools;
use rand::{RngCore, SeedableRng};
use crate::{
@@ -467,16 +302,9 @@ mod tests {
#[tokio::test]
async fn write_split() {
write_split_helper("split_writer_write_split", false).await;
}
#[tokio::test]
async fn write_split_discard() {
write_split_helper("split_writer_write_split_discard", false).await;
}
async fn write_split_helper(harness_name: &'static str, discard: bool) {
let harness = TenantHarness::create(harness_name).await.unwrap();
let harness = TenantHarness::create("split_writer_write_split")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -510,19 +338,16 @@ mod tests {
for i in 0..N {
let i = i as u32;
image_writer
.put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
discard
})
.put_image(get_key(i), get_large_img(), &tline, &ctx)
.await
.unwrap();
delta_writer
.put_value_with_discard_fn(
.put_value(
get_key(i),
Lsn(0x20),
Value::Image(get_large_img()),
&tline,
&ctx,
|_| async { discard },
)
.await
.unwrap();
@@ -535,39 +360,22 @@ mod tests {
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
if discard {
for layer in image_layers {
layer.into_discarded_layer();
}
for layer in delta_layers {
layer.into_discarded_layer();
}
} else {
let image_layers = image_layers
.into_iter()
.map(|x| x.into_resident_layer())
.collect_vec();
let delta_layers = delta_layers
.into_iter()
.map(|x| x.into_resident_layer())
.collect_vec();
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
}
assert_eq!(image_layers.len(), N / 512 + 1);
assert_eq!(delta_layers.len(), N / 512 + 1);
for idx in 0..image_layers.len() {
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
if idx > 0 {
assert_eq!(
image_layers[idx - 1].layer_desc().key_range.end,
image_layers[idx].layer_desc().key_range.start
);
assert_eq!(
delta_layers[idx - 1].layer_desc().key_range.end,
delta_layers[idx].layer_desc().key_range.start
);
}
}
}
@@ -648,49 +456,4 @@ mod tests {
.unwrap();
assert_eq!(layers.len(), 2);
}
#[tokio::test]
async fn write_split_single_key() {
let harness = TenantHarness::create("split_writer_write_split_single_key")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
const N: usize = 2000;
let mut delta_writer = SplitDeltaLayerWriter::new(
tenant.conf,
tline.timeline_id,
tenant.tenant_shard_id,
get_key(0),
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
4 * 1024 * 1024,
&ctx,
)
.await
.unwrap();
for i in 0..N {
let i = i as u32;
delta_writer
.put_value(
get_key(0),
Lsn(i as u64 * 16 + 0x10),
Value::Image(get_large_img()),
&tline,
&ctx,
)
.await
.unwrap();
}
let delta_layers = delta_writer
.finish(&tline, &ctx, get_key(N as u32))
.await
.unwrap();
assert_eq!(delta_layers.len(), 1);
}
}
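For orientation, the rollover rule used by both split writers above boils down to a size check against a running estimate: once the estimated size plus a per-entry overhead would cross the target, finish the current layer and start a new one. A simplified, self-contained sketch (the types and the overhead constant are illustrative, not the pageserver API):

const PER_ENTRY_OVERHEAD: u64 = 128; // rough key + LSN + value-header estimate, illustrative

struct SplitSink {
    target_size: u64,
    current_size: u64,
    finished: Vec<Vec<u8>>, // stand-in for finished layer files
    current: Vec<u8>,
}

impl SplitSink {
    fn put(&mut self, value: &[u8]) {
        // Roll over before the write that would push us past the target.
        if self.current_size > 0
            && self.current_size + PER_ENTRY_OVERHEAD >= self.target_size
        {
            let prev = std::mem::take(&mut self.current);
            self.finished.push(prev);
            self.current_size = 0;
        }
        self.current_size += PER_ENTRY_OVERHEAD + value.len() as u64;
        self.current.extend_from_slice(value);
    }
}

fn main() {
    let mut sink = SplitSink {
        target_size: 1024,
        current_size: 0,
        finished: Vec::new(),
        current: Vec::new(),
    };
    for _ in 0..20 {
        sink.put(&[0u8; 100]);
    }
    assert!(sink.finished.len() >= 3); // several "layers" were produced
}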

View File

@@ -192,28 +192,20 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
let started_at = Instant::now();
let sleep_duration;
if period == Duration::ZERO {
let sleep_duration = if period == Duration::ZERO {
#[cfg(not(feature = "testing"))]
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10)
Duration::from_secs(10)
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Compaction,
};
// Run compaction
let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await;
match output {
match tenant.compaction_iteration(&cancel, &ctx).await {
Ok(has_pending_task) => {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if has_pending_task { Duration::ZERO } else { period };
if has_pending_task { Duration::ZERO } else { period }
}
Err(e) => {
let wait_duration = backoff::exponential_backoff_duration_seconds(
@@ -229,14 +221,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
&wait_duration,
cancel.is_cancelled(),
);
sleep_duration = wait_duration;
wait_duration
}
}
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
};
let elapsed = started_at.elapsed();
warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction);
// the duration is recorded by performance tests by enabling debug in this function
tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete");
// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
@@ -374,27 +368,23 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
let started_at = Instant::now();
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration;
if period == Duration::ZERO || gc_horizon == 0 {
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
#[cfg(not(feature = "testing"))]
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
Duration::from_secs(10)
} else {
let iteration = Iteration {
started_at: Instant::now(),
period,
kind: BackgroundLoopKind::Gc,
};
// Run gc
let IterationResult { output, elapsed: _ } =
iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx))
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
match output {
match res {
Ok(_) => {
error_run_count = 0;
sleep_duration = period;
period
}
Err(crate::tenant::GcError::TenantCancelled) => {
return;
@@ -418,11 +408,13 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
}
sleep_duration = wait_duration;
wait_duration
}
}
};
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
if tokio::time::timeout(sleep_duration, cancel.cancelled())
.await
.is_ok()
@@ -476,12 +468,14 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
break;
}
let iteration = Iteration {
started_at: Instant::now(),
let started_at = Instant::now();
tenant.ingest_housekeeping().await;
warn_when_period_overrun(
started_at.elapsed(),
period,
kind: BackgroundLoopKind::IngestHouseKeeping,
};
iteration.run(tenant.ingest_housekeeping()).await;
BackgroundLoopKind::IngestHouseKeeping,
);
}
}
.await;
@@ -559,54 +553,6 @@ pub(crate) async fn delay_by_lease_length(
}
}
struct Iteration {
started_at: Instant,
period: Duration,
kind: BackgroundLoopKind,
}
struct IterationResult<O> {
output: O,
elapsed: Duration,
}
impl Iteration {
#[instrument(skip_all)]
pub(crate) async fn run<Fut, O>(self, fut: Fut) -> IterationResult<O>
where
Fut: std::future::Future<Output = O>,
{
let Self {
started_at,
period,
kind,
} = self;
let mut fut = std::pin::pin!(fut);
// Wrap `fut` into a future that logs a message every `period` so that we get a
// very obvious breadcrumb in the logs _while_ a slow iteration is happening.
let liveness_logger = async move {
loop {
match tokio::time::timeout(period, &mut fut).await {
Ok(x) => return x,
Err(_) => {
// info level, per the same rationale as why warn_when_period_overrun logs at info
// => https://github.com/neondatabase/neon/pull/5724
info!("still running");
}
}
}
};
let output = liveness_logger.await;
let elapsed = started_at.elapsed();
warn_when_period_overrun(elapsed, period, kind);
IterationResult { output, elapsed }
}
}
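The Iteration::run wrapper above is a reusable pattern: race the wrapped future against a repeating timeout so a breadcrumb is logged while a slow iteration is still in progress. A self-contained sketch of the same idea, using tokio and println! in place of the pageserver's tracing macros (names are illustrative):

use std::time::Duration;

async fn run_with_liveness<F, O>(fut: F, period: Duration, name: &str) -> O
where
    F: std::future::Future<Output = O>,
{
    let mut fut = std::pin::pin!(fut);
    loop {
        // On timeout the future is not consumed; keep polling it next round.
        match tokio::time::timeout(period, &mut fut).await {
            Ok(output) => return output,
            Err(_) => println!("{name}: still running"),
        }
    }
}

#[tokio::main]
async fn main() {
    let slow = async {
        tokio::time::sleep(Duration::from_millis(250)).await;
        42
    };
    let out = run_with_liveness(slow, Duration::from_millis(100), "compaction").await;
    assert_eq!(out, 42);
}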
/// Attention: the `task` and `period` become labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun(
elapsed: Duration,

View File

@@ -10,7 +10,6 @@ use std::{
use arc_swap::ArcSwap;
use enumset::EnumSet;
use tracing::{error, warn};
use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
use crate::{context::RequestContext, task_mgr::TaskKind};
@@ -34,7 +33,8 @@ pub struct Throttle<M: Metric> {
pub struct Inner {
task_kinds: EnumSet<TaskKind>,
rate_limiter: Arc<RateLimiter>,
rate_limiter: Arc<leaky_bucket::RateLimiter>,
config: Config,
}
pub type Config = pageserver_api::models::ThrottleConfig;
@@ -77,7 +77,8 @@ where
refill_interval,
refill_amount,
max,
} = config;
fair,
} = &config;
let task_kinds: EnumSet<TaskKind> = task_kinds
.iter()
.filter_map(|s| match TaskKind::from_str(s) {
@@ -92,21 +93,18 @@ where
}
})
.collect();
// steady rate, we expect `refill_amount` requests per `refill_interval`.
// dividing gives us the rps.
let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64();
let config = LeakyBucketConfig::new(rps, f64::from(max));
// initial tracks how many tokens are available to put in the bucket
// we want how many tokens are currently in the bucket
let initial_tokens = max - initial;
let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens));
Inner {
task_kinds,
rate_limiter: Arc::new(rate_limiter),
rate_limiter: Arc::new(
leaky_bucket::RateLimiter::builder()
.initial(*initial)
.interval(*refill_interval)
.refill(refill_amount.get())
.max(*max)
.fair(*fair)
.build(),
),
config,
}
}
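The conversion described in the comment above ("dividing gives us the rps", "initial_tokens = max - initial") is plain arithmetic over the throttle config fields. A tiny sketch with hypothetical numbers:

fn main() {
    // Hypothetical config values, matching the shape of the fields above.
    let refill_amount: u32 = 100; // tokens added per interval
    let refill_interval = std::time::Duration::from_millis(250);
    let max: u32 = 2000; // bucket capacity
    let initial: u32 = 500; // tokens that could still be added to the bucket

    // Steady rate: refill_amount requests per refill_interval.
    let rps = f64::from(refill_amount) / refill_interval.as_secs_f64();
    assert_eq!(rps, 400.0);

    // The leaky-bucket state wants "tokens currently in the bucket" instead.
    let initial_tokens = max - initial;
    assert_eq!(initial_tokens, 1500);
}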
pub fn reconfigure(&self, config: Config) {
@@ -129,7 +127,7 @@ where
/// See [`Config::steady_rps`].
pub fn steady_rps(&self) -> f64 {
self.inner.load().rate_limiter.steady_rps()
self.inner.load().config.steady_rps()
}
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
@@ -138,9 +136,18 @@ where
return None;
};
let start = std::time::Instant::now();
let did_throttle = inner.rate_limiter.acquire(key_count).await;
let mut did_throttle = false;
let acquire = inner.rate_limiter.acquire(key_count);
// turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
let acquire = tokio::task::unconstrained(acquire);
let mut acquire = std::pin::pin!(acquire);
std::future::poll_fn(|cx| {
use std::future::Future;
let poll = acquire.as_mut().poll(cx);
did_throttle = did_throttle || poll.is_pending();
poll
})
.await;
self.count_accounted.fetch_add(1, Ordering::Relaxed);
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
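The poll_fn dance above is a general trick for observing whether a future ever had to wait: poll it manually and record any Pending result, with tokio's cooperative budgeting disabled so a Pending cannot come from runtime preemption. A standalone sketch of the same idea (function names are illustrative):

use std::future::Future;
use std::pin::pin;
use std::time::Duration;

// Await `fut`, returning its output plus whether it ever returned Pending.
async fn await_and_observe<F: Future>(fut: F) -> (F::Output, bool) {
    let mut waited = false;
    // Disable coop so Pending really means "had to wait", not "was preempted".
    let fut = tokio::task::unconstrained(fut);
    let mut fut = pin!(fut);
    let out = std::future::poll_fn(|cx| {
        let poll = fut.as_mut().poll(cx);
        waited = waited || poll.is_pending();
        poll
    })
    .await;
    (out, waited)
}

#[tokio::main]
async fn main() {
    let (_, waited) = await_and_observe(async {}).await;
    assert!(!waited); // an already-ready future never pends

    let (_, waited) = await_and_observe(tokio::time::sleep(Duration::from_millis(5))).await;
    assert!(waited); // the sleep had to wait at least once
}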

View File

@@ -22,8 +22,8 @@ use handle::ShardTimelineId;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
NON_INHERITED_SPARSE_RANGE,
},
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
models::{
@@ -44,8 +44,10 @@ use tokio::{
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
bin_ser::BeSer,
fs_ext, pausable_failpoint,
sync::gate::{Gate, GateGuard},
vec_map::VecMap,
};
use std::pin::pin;
@@ -69,7 +71,7 @@ use crate::{
config::defaults::DEFAULT_PITR_INTERVAL,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
storage_layer::PersistentLayerDesc,
},
walredo,
};
@@ -135,10 +137,7 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::{
config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
upload_queue::NotInitialized,
};
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{
@@ -218,7 +217,7 @@ pub(crate) struct RelSizeCache {
}
pub struct Timeline {
pub(crate) conf: &'static PageServerConf,
conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
myself: Weak<Self>,
@@ -867,11 +866,6 @@ impl Timeline {
.map(|ancestor| ancestor.timeline_id)
}
/// Get the ancestor timeline
pub(crate) fn ancestor_timeline(&self) -> Option<&Arc<Timeline>> {
self.ancestor_timeline.as_ref()
}
/// Get the bytes written since the PITR cutoff on this branch, and
/// whether this branch's ancestor_lsn is within its parent's PITR.
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
@@ -1912,8 +1906,6 @@ impl Timeline {
true
} else if projected_layer_size >= checkpoint_distance {
// NB: this check is relied upon by:
let _ = IndexEntry::validate_checkpoint_distance;
info!(
"Will roll layer at {} with layer size {} due to layer size ({})",
projected_lsn, layer_size, projected_layer_size
@@ -2241,11 +2233,6 @@ impl Timeline {
handles: Default::default(),
};
if aux_file_policy == Some(AuxFilePolicy::V1) {
warn!("this timeline is using deprecated aux file policy V1");
}
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -3009,10 +2996,7 @@ impl Timeline {
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
// the layer is likely to be covered by an image layer during compaction.
layers.sort_by_key(|(desc, _meta, _atime)| {
std::cmp::Reverse((
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
desc.lsn_range.end,
))
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
});
let layers = layers
@@ -3605,6 +3589,34 @@ impl Timeline {
return Err(FlushLayerError::Cancelled);
}
// FIXME(auxfilesv2): supporting multiple metadata key partitions might need initdb support as well?
// This code path will not be hit during regression tests. After #7099 we have a single partition
// with two key ranges. If someone wants to fix initdb optimization in the future, this might need
// to be fixed.
// For metadata, always create delta layers.
let delta_layer = if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single metadata keyspace"
);
let metadata_keyspace = &metadata_partition.parts[0];
self.create_delta_layer(
&frozen_layer,
Some(
metadata_keyspace.0.ranges.first().unwrap().start
..metadata_keyspace.0.ranges.last().unwrap().end,
),
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?
} else {
None
};
// For image layers, we add them immediately into the layer map.
let mut layers_to_upload = Vec::new();
layers_to_upload.extend(
self.create_image_layers(
@@ -3615,27 +3627,13 @@ impl Timeline {
)
.await?,
);
if !metadata_partition.parts.is_empty() {
assert_eq!(
metadata_partition.parts.len(),
1,
"currently sparse keyspace should only contain a single metadata keyspace"
);
layers_to_upload.extend(
self.create_image_layers(
// Safety: create_image_layers treats sparse keyspaces differently in that it does not scan
// every single key within the keyspace, and therefore it's safe to force-convert it
// into a dense keyspace before calling this function.
&metadata_partition.into_dense(),
self.initdb_lsn,
ImageLayerCreationMode::Initial,
ctx,
)
.await?,
);
}
(layers_to_upload, None)
if let Some(delta_layer) = delta_layer {
layers_to_upload.push(delta_layer.clone());
(layers_to_upload, Some(delta_layer))
} else {
(layers_to_upload, None)
}
} else {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
@@ -4045,6 +4043,8 @@ impl Timeline {
mode: ImageLayerCreationMode,
start: Key,
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
assert!(!matches!(mode, ImageLayerCreationMode::Initial));
// Metadata keys image layer creation.
let mut reconstruct_state = ValuesReconstructState::default();
let data = self
@@ -4210,13 +4210,15 @@ impl Timeline {
"metadata keys must be partitioned separately"
);
}
if mode == ImageLayerCreationMode::Initial {
return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
}
if mode == ImageLayerCreationMode::Try && !check_for_image_layers {
// Skip compaction if there are not enough updates. Metadata compaction will do a scan and
// might interfere with evictions.
start = img_range.end;
continue;
}
// For initial and force modes, we always generate image layers for metadata keys.
} else if let ImageLayerCreationMode::Try = mode {
// check_for_image_layers = false -> skip
// check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
@@ -4224,8 +4226,7 @@ impl Timeline {
start = img_range.end;
continue;
}
}
if let ImageLayerCreationMode::Force = mode {
} else if let ImageLayerCreationMode::Force = mode {
// When forced to create image layers, we might try and create them where they already
// exist. This mode is only used in tests/debug.
let layers = self.layers.read().await;
@@ -4239,7 +4240,6 @@ impl Timeline {
img_range.start,
img_range.end
);
start = img_range.end;
continue;
}
}
@@ -4595,7 +4595,7 @@ impl Timeline {
// for compact_level0_phase1 creating an L0, which does not happen in practice
// because we have not implemented L0 => L0 compaction.
duplicated_layers.insert(l.layer_desc().key());
} else if LayerMap::is_l0(&l.layer_desc().key_range, l.layer_desc().is_delta) {
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
return Err(CompactionError::Other(anyhow::anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
} else {
insert_layers.push(l.clone());
@@ -5451,17 +5451,12 @@ impl Timeline {
!(a.end <= b.start || b.end <= a.start)
}
if deltas.key_range.start.next() != deltas.key_range.end {
let guard = self.layers.read().await;
let mut invalid_layers =
guard.layer_map()?.iter_historic_layers().filter(|layer| {
layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
// skip single-key layer files
&& layer.key_range.start.next() != layer.key_range.end
});
if let Some(layer) = invalid_layers.next() {
let guard = self.layers.read().await;
for layer in guard.layer_map()?.iter_historic_layers() {
if layer.is_delta()
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
&& layer.lsn_range != deltas.lsn_range
{
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
panic!(
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
@@ -5595,6 +5590,44 @@ enum OpenLayerAction {
}
impl<'a> TimelineWriter<'a> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Avoid doing allocations for "small" values.
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
value.ser_into(&mut buf)?;
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn));
}
res
}
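The SmallVec above is the whole trick: values are serialized into a buffer with 256 bytes of inline capacity, so small values never touch the heap. A tiny illustration using the smallvec crate (the payload is hypothetical):

use smallvec::SmallVec;

fn main() {
    let payload = b"short WAL record";
    let mut buf: SmallVec<[u8; 256]> = SmallVec::new();
    buf.extend_from_slice(payload);
    assert!(!buf.spilled()); // still inline: no heap allocation happened

    let big = vec![0u8; 1024];
    buf.extend_from_slice(&big);
    assert!(buf.spilled()); // larger than 256 bytes: spilled to the heap
}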
async fn handle_open_layer_action(
&mut self,
at: Lsn,
@@ -5700,64 +5733,18 @@ impl<'a> TimelineWriter<'a> {
}
/// Put a batch of keys at the specified Lsns.
///
/// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]).
pub(crate) async fn put_batch(
&mut self,
batch: Vec<(CompactKey, Lsn, usize, Value)>,
batch: VecMap<Lsn, (Key, Value)>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
return Ok(());
for (lsn, (key, val)) in batch {
self.put(key, lsn, &val, ctx).await?
}
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
let batch_max_lsn = serialized_batch.max_lsn;
let buf_size: u64 = serialized_batch.raw.len() as u64;
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
let layer = self
.handle_open_layer_action(batch_max_lsn, action, ctx)
.await?;
let res = layer.put_batch(serialized_batch, ctx).await;
if res.is_ok() {
// Update the current size only when the entire write was ok.
// In case of failures, we may have had partial writes which
// render the size tracking out of sync. That's ok because
// the checkpoint distance should be significantly smaller
// than the S3 single shot upload limit of 5GiB.
let state = self.write_guard.as_mut().unwrap();
state.current_size += buf_size;
state.prev_lsn = Some(batch_max_lsn);
state.max_lsn = std::cmp::max(state.max_lsn, Some(batch_max_lsn));
}
res
}
#[cfg(test)]
/// Test helper, for tests that would like to poke individual values without composing a batch
pub(crate) async fn put(
&mut self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use utils::bin_ser::BeSer;
if !key.is_valid_key_on_write_path() {
bail!(
"the request contains data not supported by pageserver at TimelineWriter::put: {}",
key
);
}
let val_ser_size = value.serialized_size().unwrap() as usize;
self.put_batch(
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
ctx,
)
.await
Ok(())
}
pub(crate) async fn delete_batch(
@@ -5898,7 +5885,7 @@ mod tests {
};
// Apart from L0s, newest Layers should come first
if !LayerMap::is_l0(layer.name.key_range(), layer.name.is_delta()) {
if !LayerMap::is_l0(layer.name.key_range()) {
assert!(layer_lsn <= last_lsn);
last_lsn = layer_lsn;
}

View File

@@ -14,7 +14,7 @@ use super::{
RecordedDuration, Timeline,
};
use anyhow::{anyhow, bail, Context};
use anyhow::{anyhow, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
@@ -32,9 +32,6 @@ use crate::page_cache;
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::split_writer::{
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
};
use crate::tenant::storage_layer::{
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
};
@@ -74,60 +71,15 @@ pub(crate) struct KeyHistoryRetention {
}
impl KeyHistoryRetention {
/// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
///
/// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
/// For example, consider the case where a single delta with range [0x10,0x50) exists.
/// And we have branches at LSN 0x10, 0x20, 0x30.
/// Then we delete branch @ 0x20.
/// Bottom-most compaction may now delete the delta [0x20,0x30).
/// And that wouldn't change the shape of the layer.
///
/// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
///
/// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
if dry_run {
return true;
}
let guard = tline.layers.read().await;
if !guard.contains_key(key) {
return false;
}
let layer_generation = guard.get_from_key(key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
info!(
key=%key,
?layer_generation,
"discard layer due to duplicated layer key in the same generation",
);
true
} else {
false
}
}
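A simplified model of the decision above: a layer with the same key is only kept (the rewrite discarded) if the existing file belongs to the current generation, and a dry run discards everything. The layer-key to generation map here is illustrative, not the real layer map:

use std::collections::HashMap;

fn should_discard(
    existing: &HashMap<String, u32>,
    key: &str,
    current_generation: u32,
    dry_run: bool,
) -> bool {
    if dry_run {
        return true; // dry runs never write anything
    }
    match existing.get(key) {
        Some(&generation) => generation == current_generation,
        None => false,
    }
}

fn main() {
    let mut layers = HashMap::new();
    layers.insert("delta-0x10-0x50".to_string(), 7);
    assert!(should_discard(&layers, "delta-0x10-0x50", 7, false)); // same generation: keep existing file
    assert!(!should_discard(&layers, "delta-0x10-0x50", 8, false)); // older generation: rewrite
    assert!(should_discard(&layers, "missing", 7, true)); // dry run: never write
}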
/// Pipe a history of a single key to the writers.
///
/// If `image_writer` is none, the images will be placed into the delta layers.
/// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
#[allow(clippy::too_many_arguments)]
async fn pipe_to(
self,
key: Key,
tline: &Arc<Timeline>,
delta_writer: &mut SplitDeltaLayerWriter,
mut image_writer: Option<&mut SplitImageLayerWriter>,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
mut image_writer: Option<&mut ImageLayerWriter>,
stat: &mut CompactionStatistics,
dry_run: bool,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
async move { Self::discard_key(&key, tline, dry_run).await }
};
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
if first_batch {
if logs.len() == 1 && logs[0].1.is_image() {
@@ -136,45 +88,28 @@ impl KeyHistoryRetention {
};
stat.produce_image_key(img);
if let Some(image_writer) = image_writer.as_mut() {
image_writer
.put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
.await?;
image_writer.put_image(key, img.clone(), ctx).await?;
} else {
delta_writer
.put_value_with_discard_fn(
key,
cutoff_lsn,
Value::Image(img.clone()),
tline,
ctx,
discard,
)
.await?;
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
}
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
delta_writer.push((key, lsn, val));
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
stat.produce_key(&val);
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
delta_writer.push((key, lsn, val));
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
stat.produce_key(&val);
delta_writer
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
.await?;
delta_writer.push((key, lsn, val));
}
Ok(())
}
@@ -1879,27 +1814,11 @@ impl Timeline {
}
let mut selected_layers = Vec::new();
drop(gc_info);
// Pick all the layers that intersect or are below the gc_cutoff, and get the largest LSN among the selected layers.
let Some(max_layer_lsn) = layers
.iter_historic_layers()
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
.map(|desc| desc.get_lsn_range().end)
.max()
else {
info!("no layers to compact with gc");
return Ok(());
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= max_layer_lsn {
if desc.get_lsn_range().start <= gc_cutoff {
selected_layers.push(guard.get_from_desc(&desc));
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
return Ok(());
}
retain_lsns_below_horizon.sort();
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
};
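The two-step selection described in the comments above can be condensed to a few lines over plain LSN ranges: first find the largest end-LSN among layers whose start is at or below the cutoff, then take every layer that ends at or below that LSN. A sketch with u64 pairs standing in for layer descriptors:

fn select_for_gc_compaction(layers: &[(u64, u64)], gc_cutoff: u64) -> Vec<(u64, u64)> {
    let Some(max_layer_lsn) = layers
        .iter()
        .filter(|(start, _)| *start <= gc_cutoff)
        .map(|(_, end)| *end)
        .max()
    else {
        return Vec::new(); // no layers to compact with gc
    };
    layers
        .iter()
        .copied()
        .filter(|(_, end)| *end <= max_layer_lsn)
        .collect()
}

fn main() {
    let layers = [(0, 10), (10, 20), (15, 30), (30, 40)];
    // With a cutoff of 18 the deepest intersecting layer ends at 30,
    // so everything ending at or below 30 is selected.
    assert_eq!(
        select_for_gc_compaction(&layers, 18),
        vec![(0, 10), (10, 20), (15, 30)]
    );
}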
@@ -1929,53 +1848,27 @@ impl Timeline {
lowest_retain_lsn
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
// Also, collect the layer information to decide when to split the new delta layers.
let mut downloaded_layers = Vec::new();
let mut delta_split_points = BTreeSet::new();
for layer in &layer_selection {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
let desc = layer.layer_desc();
if desc.is_delta() {
// ignore single-key layer files
if desc.key_range.start.next() != desc.key_range.end {
let lsn_range = &desc.lsn_range;
lsn_split_point.insert(lsn_range.start);
lsn_split_point.insert(lsn_range.end);
}
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
// so that we can avoid having too many small delta layers.
let key_range = desc.get_key_range();
delta_split_points.insert(key_range.start);
delta_split_points.insert(key_range.end);
stat.visit_delta_layer(desc.file_size());
} else {
stat.visit_image_layer(desc.file_size());
}
}
for layer in &layer_selection {
let desc = layer.layer_desc();
let key_range = &desc.key_range;
if desc.is_delta() && key_range.start.next() != key_range.end {
let lsn_range = desc.lsn_range.clone();
let intersects = lsn_split_point.range(lsn_range).collect_vec();
if intersects.len() > 1 {
bail!(
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
desc.key(),
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
);
}
}
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = layer_selection
.iter()
.map(|l| l.layer_desc().lsn_range.end)
.max()
.unwrap();
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
// as an L0 layer.
let hack_end_key = Key::NON_L0_MAX;
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
for layer in &layer_selection {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
}
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
@@ -1991,17 +1884,138 @@ impl Timeline {
let mut accumulated_values = Vec::new();
let mut last_key: Option<Key> = None;
enum FlushDeltaResult {
/// Create a new resident layer
CreateResidentLayer(ResidentLayer),
/// Keep an original delta layer
KeepLayer(PersistentLayerKey),
}
#[allow(clippy::too_many_arguments)]
async fn flush_deltas(
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
last_key: Key,
delta_split_points: &[Key],
current_delta_split_point: &mut usize,
tline: &Arc<Timeline>,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
stats: &mut CompactionStatistics,
dry_run: bool,
last_batch: bool,
) -> anyhow::Result<Option<FlushDeltaResult>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
// overlapping layers.
//
// If we have a structure like this:
//
// | Delta 1 | | Delta 4 |
// |---------| Delta 2 |---------|
// | Delta 3 | | Delta 5 |
//
// And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
// A simple solution here is to split the delta layers using the original boundary, while this
// might produce a lot of small layers. This should be improved and fixed in the future.
let mut need_split = false;
while *current_delta_split_point < delta_split_points.len()
&& last_key >= delta_split_points[*current_delta_split_point]
{
*current_delta_split_point += 1;
need_split = true;
}
if !need_split && !last_batch {
return Ok(None);
}
let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
if deltas.is_empty() {
return Ok(None);
}
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
let delta_key = PersistentLayerKey {
key_range: {
let key_start = deltas.first().unwrap().0;
let key_end = deltas.last().unwrap().0.next();
key_start..key_end
},
lsn_range: lowest_retain_lsn..end_lsn,
is_delta: true,
};
{
// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
//
// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
// For example, consider the case where a single delta with range [0x10,0x50) exists.
// And we have branches at LSN 0x10, 0x20, 0x30.
// Then we delete branch @ 0x20.
// Bottom-most compaction may now delete the delta [0x20,0x30).
// And that wouldn't change the shape of the layer.
//
// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
// That's why it's safe to skip.
let guard = tline.layers.read().await;
if guard.contains_key(&delta_key) {
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
drop(guard);
if layer_generation == tline.generation {
stats.discard_delta_layer();
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and if that creates a duplicated layer key, we will still error
// in the end.
info!(
key=%delta_key,
?layer_generation,
"discard delta layer due to duplicated layer in the same generation"
);
return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
}
}
}
let mut delta_layer_writer = DeltaLayerWriter::new(
tline.conf,
tline.timeline_id,
tline.tenant_shard_id,
delta_key.key_range.start,
lowest_retain_lsn..end_lsn,
ctx,
)
.await?;
for (key, lsn, val) in deltas {
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
}
stats.produce_delta_layer(delta_layer_writer.size());
if dry_run {
return Ok(None);
}
let (desc, path) = delta_layer_writer
.finish(delta_key.key_range.end, ctx)
.await?;
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
}
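The split-point bookkeeping above (advance current_delta_split_point while the last written key has passed a boundary) is easy to get wrong by one, so here is a reduced sketch of just that check, with u64 keys standing in for Key:

fn need_split(split_points: &[u64], cursor: &mut usize, last_key: u64) -> bool {
    let mut split = false;
    // Consume every boundary the last written key has already passed.
    while *cursor < split_points.len() && last_key >= split_points[*cursor] {
        *cursor += 1;
        split = true;
    }
    split
}

fn main() {
    let split_points = [10, 20, 30];
    let mut cursor = 0;
    assert!(!need_split(&split_points, &mut cursor, 5)); // before the first boundary: keep buffering
    assert!(need_split(&split_points, &mut cursor, 25)); // crossed 10 and 20: flush once
    assert_eq!(cursor, 2);
}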
// Hack the key range to be min..(max-1). Otherwise, the image layer will be
// interpreted as an L0 delta layer.
let hack_image_layer_range = {
let mut end_key = Key::MAX;
end_key.field6 -= 1;
Key::MIN..end_key
};
// Only create image layers when there are no ancestor branches. TODO: create a covering image layer
// when some conditions are met.
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
Some(
SplitImageLayerWriter::new(
ImageLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
Key::MIN,
&hack_image_layer_range, // covers the full key range
lowest_retain_lsn,
self.get_compaction_target_size(),
ctx,
)
.await?,
@@ -2010,17 +2024,6 @@ impl Timeline {
None
};
let mut delta_layer_writer = SplitDeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
Key::MIN,
lowest_retain_lsn..end_lsn,
self.get_compaction_target_size(),
ctx,
)
.await?;
/// Returns None if there is no ancestor branch. Throws an error when the key is not found.
///
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2041,11 +2044,47 @@ impl Timeline {
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
Ok(Some((key, tline.ancestor_lsn, img)))
}
let image_layer_key = PersistentLayerKey {
key_range: hack_image_layer_range,
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
is_delta: false,
};
// Like with delta layers, it can happen that we re-produce an already existing image layer.
// This could happen when a user triggers force compaction and image generation. In this case,
// it's always safe to rewrite the layer.
let discard_image_layer = {
let guard = self.layers.read().await;
if guard.contains_key(&image_layer_key) {
let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
drop(guard);
if layer_generation == self.generation {
// TODO: depending on whether we design this compaction process to run along with
// other compactions, there could be layer map modifications after we drop the
// layer guard, and if that creates a duplicated layer key, we will still error
// in the end.
info!(
key=%image_layer_key,
?layer_generation,
"discard image layer due to duplicated layer key in the same generation",
);
true
} else {
false
}
} else {
false
}
};
// Actually, we can decide not to write to the image layer at all at this point because
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard it at the end.
let mut delta_values = Vec::new();
let delta_split_points = delta_split_points.into_iter().collect_vec();
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
@@ -2076,14 +2115,27 @@ impl Timeline {
retention
.pipe_to(
*last_key,
self,
&mut delta_layer_writer,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
dry_run,
ctx,
)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
*last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
false,
)
.await?,
);
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -2107,75 +2159,43 @@ impl Timeline {
retention
.pipe_to(
last_key,
self,
&mut delta_layer_writer,
&mut delta_values,
image_layer_writer.as_mut(),
&mut stat,
dry_run,
ctx,
)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
lowest_retain_lsn,
ctx,
&mut stat,
dry_run,
true,
)
.await?,
);
assert!(delta_values.is_empty(), "unprocessed keys");
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
};
let produced_image_layers = if let Some(writer) = image_layer_writer {
let image_layer = if discard_image_layer {
stat.discard_image_layer();
None
} else if let Some(writer) = image_layer_writer {
stat.produce_image_layer(writer.size());
if !dry_run {
writer
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
Some(writer.finish(self, ctx).await?)
} else {
let (layers, _) = writer.take()?;
assert!(layers.is_empty(), "image layers produced in dry run mode?");
Vec::new()
None
}
} else {
Vec::new()
None
};
let produced_delta_layers = if !dry_run {
delta_layer_writer
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
.await?
} else {
let (layers, _) = delta_layer_writer.take()?;
assert!(layers.is_empty(), "delta layers produced in dry run mode?");
Vec::new()
};
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
let produced_image_layers_len = produced_image_layers.len();
for action in produced_delta_layers {
match action {
SplitWriterResult::Produced(layer) => {
stat.produce_delta_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
SplitWriterResult::Discarded(l) => {
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
for action in produced_image_layers {
match action {
SplitWriterResult::Produced(layer) => {
stat.produce_image_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
SplitWriterResult::Discarded(l) => {
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
@@ -2186,11 +2206,28 @@ impl Timeline {
}
info!(
"produced {} delta layers and {} image layers, {} layers are kept",
produced_delta_layers_len,
produced_image_layers_len,
layer_selection.len()
"produced {} delta layers and {} image layers",
delta_layers.len(),
if image_layer.is_some() { 1 } else { 0 }
);
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
for action in delta_layers {
match action {
FlushDeltaResult::CreateResidentLayer(layer) => {
compact_to.push(layer);
}
FlushDeltaResult::KeepLayer(l) => {
keep_layers.insert(l);
}
}
}
if discard_image_layer {
keep_layers.insert(image_layer_key);
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
compact_to.extend(image_layer);
// Step 3: Place back to the layer map.
{

View File

@@ -27,8 +27,8 @@ use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
pgdatadir_mapping::DatadirModification,
task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
@@ -345,10 +345,7 @@ pub(super) async fn handle_walreceiver_connection(
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
if uncommitted_records >= ingest_batch_size {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);

View File

@@ -27,7 +27,7 @@ use utils::vec_map::VecMap;
use crate::context::RequestContext;
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::VirtualFile;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
@@ -60,7 +60,7 @@ pub struct VectoredBlobsBuf {
pub struct VectoredRead {
pub start: u64,
pub end: u64,
/// Start offset and metadata for each blob in this read
/// Starting offsets and metadata for each blob in this read
pub blobs_at: VecMap<u64, BlobMeta>,
}
@@ -76,109 +76,14 @@ pub(crate) enum VectoredReadExtended {
No,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VectoredReadCoalesceMode {
/// Only coalesce exactly adjacent reads.
AdjacentOnly,
/// In addition to adjacent reads, also consider reads whose corresponding
/// `end` and `start` offsets reside at the same chunk.
Chunked(usize),
}
impl VectoredReadCoalesceMode {
/// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
/// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
pub(crate) fn get() -> Self {
let align = virtual_file::get_io_buffer_alignment_raw();
if align == 0 {
VectoredReadCoalesceMode::AdjacentOnly
} else {
VectoredReadCoalesceMode::Chunked(align)
}
}
}
pub(crate) enum VectoredReadBuilder {
Adjacent(AdjacentVectoredReadBuilder),
Chunked(ChunkedVectoredReadBuilder),
}
impl VectoredReadBuilder {
fn new_impl(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: Option<usize>,
mode: VectoredReadCoalesceMode,
) -> Self {
match mode {
VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
),
VectoredReadCoalesceMode::Chunked(chunk_size) => {
Self::Chunked(ChunkedVectoredReadBuilder::new(
start_offset,
end_offset,
meta,
max_read_size,
chunk_size,
))
}
}
}
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
mode: VectoredReadCoalesceMode,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
}
pub(crate) fn new_streaming(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
mode: VectoredReadCoalesceMode,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, None, mode)
}
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
}
}
pub(crate) fn build(self) -> VectoredRead {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.build(),
VectoredReadBuilder::Chunked(builder) => builder.build(),
}
}
pub(crate) fn size(&self) -> usize {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.size(),
VectoredReadBuilder::Chunked(builder) => builder.size(),
}
}
}
pub(crate) struct AdjacentVectoredReadBuilder {
/// Start offset of the read.
pub(crate) struct VectoredReadBuilder {
start: u64,
// End offset of the read.
end: u64,
/// Start offset and metadata for each blob in this read
blobs_at: VecMap<u64, BlobMeta>,
max_read_size: Option<usize>,
}
impl AdjacentVectoredReadBuilder {
impl VectoredReadBuilder {
/// Start building a new vectored read.
///
/// Note that by design, this does not check against reading more than `max_read_size` to
@@ -188,7 +93,7 @@ impl AdjacentVectoredReadBuilder {
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: Option<usize>,
max_read_size: usize,
) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
@@ -199,7 +104,7 @@ impl AdjacentVectoredReadBuilder {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size,
max_read_size: Some(max_read_size),
}
}
/// Attempt to extend the current read with a new blob if the start
@@ -208,15 +113,13 @@ impl AdjacentVectoredReadBuilder {
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
tracing::trace!(start, end, "trying to extend");
let size = (end - start) as usize;
let not_limited_by_max_read_size = {
if self.end == start && {
if let Some(max_read_size) = self.max_read_size {
self.size() + size <= max_read_size
} else {
true
}
};
if self.end == start && not_limited_by_max_read_size {
} {
self.end = end;
self.blobs_at
.append(start, meta)
@@ -241,107 +144,6 @@ impl AdjacentVectoredReadBuilder {
}
}
pub(crate) struct ChunkedVectoredReadBuilder {
/// Start block number
start_blk_no: usize,
/// End block number (exclusive).
end_blk_no: usize,
/// Start offset and metadata for each blob in this read
blobs_at: VecMap<u64, BlobMeta>,
max_read_size: Option<usize>,
/// Chunk size reads are coalesced into.
chunk_size: usize,
}
/// Computes x / d rounded up.
fn div_round_up(x: usize, d: usize) -> usize {
(x + (d - 1)) / d
}
impl ChunkedVectoredReadBuilder {
/// Start building a new vectored read.
///
/// Note that by design, this does not check against reading more than `max_read_size` to
/// support reading blobs larger than the configured value. However, the builder is single-use
/// after that.
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: Option<usize>,
chunk_size: usize,
) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, meta)
.expect("First insertion always succeeds");
let start_blk_no = start_offset as usize / chunk_size;
let end_blk_no = div_round_up(end_offset as usize, chunk_size);
Self {
start_blk_no,
end_blk_no,
blobs_at,
max_read_size,
chunk_size,
}
}
/// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
///
/// The resulting coalesced size must also stay within the max read size.
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
tracing::trace!(start, end, "trying to extend");
let start_blk_no = start as usize / self.chunk_size;
let end_blk_no = div_round_up(end as usize, self.chunk_size);
let not_limited_by_max_read_size = {
if let Some(max_read_size) = self.max_read_size {
let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size;
coalesced_size <= max_read_size
} else {
true
}
};
// True if the new blob starts in the last chunk of the current read or in the
// chunk immediately after it.
//
// Note: This automatically handles the case where two blobs are adjacent to each other,
// whether or not they start on a chunk-size boundary.
let is_adjacent_chunk_read = {
// 1. first.end & second.start are in the same block
self.end_blk_no == start_blk_no + 1 ||
// 2. first.end ends one block before second.start
self.end_blk_no == start_blk_no
};
if is_adjacent_chunk_read && not_limited_by_max_read_size {
self.end_blk_no = end_blk_no;
self.blobs_at
.append(start, meta)
.expect("LSNs are ordered within vectored reads");
return VectoredReadExtended::Yes;
}
VectoredReadExtended::No
}
pub(crate) fn size(&self) -> usize {
(self.end_blk_no - self.start_blk_no) * self.chunk_size
}
pub(crate) fn build(self) -> VectoredRead {
let start = (self.start_blk_no * self.chunk_size) as u64;
let end = (self.end_blk_no * self.chunk_size) as u64;
VectoredRead {
start,
end,
blobs_at: self.blobs_at,
}
}
}
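To make the chunk arithmetic concrete, an editor's sketch against the builder above (chunk size 512, offsets chosen arbitrarily; not part of the diff):

fn chunked_builder_sketch() {
    let meta = BlobMeta { key: Key::MIN, lsn: Lsn(0) };
    // A blob at bytes [200, 700) touches chunks 0 and 1, so the read starts out
    // covering chunks [0, 2).
    let mut builder = ChunkedVectoredReadBuilder::new(200, 700, meta, Some(64 * 1024), 512);
    // A blob at [1024, 1200) starts in chunk 2, immediately after the last
    // covered chunk: it is coalesced and the read now covers chunks [0, 3).
    assert!(matches!(builder.extend(1024, 1200, meta), VectoredReadExtended::Yes));
    // A blob at [2048, 2100) starts in chunk 4, skipping chunk 3 entirely: rejected.
    assert!(matches!(builder.extend(2048, 2100, meta), VectoredReadExtended::No));
    // build() rounds the read out to chunk boundaries, while the recorded blob
    // offsets keep their exact byte positions.
    let read = builder.build();
    assert_eq!((read.start, read.end), (0, 1536));
    assert_eq!(read.blobs_at.as_slice()[0].0, 200);
}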
#[derive(Copy, Clone, Debug)]
pub enum BlobFlag {
None,
@@ -364,18 +166,14 @@ pub struct VectoredReadPlanner {
prev: Option<(Key, Lsn, u64, BlobFlag)>,
max_read_size: usize,
mode: VectoredReadCoalesceMode,
}
impl VectoredReadPlanner {
pub fn new(max_read_size: usize) -> Self {
let mode = VectoredReadCoalesceMode::get();
Self {
blobs: BTreeMap::new(),
prev: None,
max_read_size,
mode,
}
}
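A minimal editor's sketch of driving the planner, mirroring the tests further down in this file (offsets are arbitrary):

fn planner_usage_sketch() {
    let (key, lsn) = (Key::MIN, Lsn(0));
    let mut planner = VectoredReadPlanner::new(128 * 1024);
    // Each call provides the *start* of a blob; the end of a blob is implied by
    // the start of the next one, and BlobFlag::Ignore can be used to record a
    // gap that should not be read.
    planner.handle(key, lsn, 0, BlobFlag::None);
    planner.handle(key, lsn, 1024, BlobFlag::None);
    // The range end closes the final blob.
    planner.handle_range_end(2048);
    let reads = planner.finish();
    assert_eq!(reads.len(), 1); // both blobs fit into a single coalesced read
}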
@@ -454,7 +252,6 @@ impl VectoredReadPlanner {
end_offset,
BlobMeta { key, lsn },
self.max_read_size,
self.mode,
);
let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -506,18 +303,6 @@ impl<'a> VectoredBlobReader<'a> {
read.size(),
buf.capacity()
);
if cfg!(debug_assertions) {
let align = virtual_file::get_io_buffer_alignment() as u64;
debug_assert_eq!(
read.start % align,
0,
"Read start at {} does not satisfy the required io buffer alignment ({} bytes)",
read.start,
align
);
}
let mut buf = self
.file
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
@@ -525,20 +310,27 @@ impl<'a> VectoredBlobReader<'a> {
.into_inner();
let blobs_at = read.blobs_at.as_slice();
let start_offset = read.start;
let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
let mut metas = Vec::with_capacity(blobs_at.len());
// Blobs in `read` only provide their starting offset. The end offset
// of a blob is implicit: the start of the next blob if one exists
// or the end of the read.
let pairs = blobs_at.iter().zip(
blobs_at
.iter()
.map(Some)
.skip(1)
.chain(std::iter::once(None)),
);
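// For example, blobs starting at offsets [0, 100, 250] yield the pairs
// (0, Some(100)), (100, Some(250)) and (250, None); in the None case the end
// of the blob is taken from the length encoded in its header below.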
// Some scratch space, put here for reusing the allocation
let mut decompressed_vec = Vec::new();
for (blob_start, meta) in blobs_at {
let blob_start_in_buf = blob_start - start_offset;
let first_len_byte = buf[blob_start_in_buf as usize];
for ((offset, meta), next) in pairs {
let offset_in_buf = offset - start_offset;
let first_len_byte = buf[offset_in_buf as usize];
// Each blob is prefixed by a header containing its size and compression information.
// Extract the size and skip that header to find the start of the data.
@@ -548,7 +340,7 @@ impl<'a> VectoredBlobReader<'a> {
(1, first_len_byte as u64, BYTE_UNCOMPRESSED)
} else {
let mut blob_size_buf = [0u8; 4];
let offset_in_buf = blob_start_in_buf as usize;
let offset_in_buf = offset_in_buf as usize;
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
@@ -561,8 +353,12 @@ impl<'a> VectoredBlobReader<'a> {
)
};
let start_raw = blob_start_in_buf + size_length;
let end_raw = start_raw + blob_size;
let start_raw = offset_in_buf + size_length;
let end_raw = match next {
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
None => start_raw + blob_size,
};
assert_eq!(end_raw - start_raw, blob_size);
let (start, end);
if compression_bits == BYTE_UNCOMPRESSED {
start = start_raw as usize;
@@ -611,22 +407,18 @@ pub struct StreamingVectoredReadPlanner {
max_cnt: usize,
/// Size of the current batch
cnt: usize,
mode: VectoredReadCoalesceMode,
}
impl StreamingVectoredReadPlanner {
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
assert!(max_cnt > 0);
assert!(max_read_size > 0);
let mode = VectoredReadCoalesceMode::get();
Self {
read_builder: None,
prev: None,
max_cnt,
max_read_size,
cnt: 0,
mode,
}
}
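Unlike the non-streaming planner, batches here appear to be cut by two limits, a blob count and a byte size, rather than by read adjacency; a hedged sketch of that cut condition (the helper name is the editor's own, not part of the code):

// Hypothetical helper: a streaming batch is finalized once it holds max_cnt
// blobs or has grown to max_read_size bytes.
fn batch_is_full(cnt: usize, max_cnt: usize, size: u64, max_read_size: u64) -> bool {
    cnt >= max_cnt || size >= max_read_size
}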
@@ -675,12 +467,17 @@ impl StreamingVectoredReadPlanner {
}
None => {
self.read_builder = {
Some(VectoredReadBuilder::new_streaming(
start_offset,
end_offset,
BlobMeta { key, lsn },
self.mode,
))
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, BlobMeta { key, lsn })
.expect("First insertion always succeeds");
Some(VectoredReadBuilder {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size: None,
})
};
}
}
@@ -714,9 +511,7 @@ mod tests {
use super::*;
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
let align = virtual_file::get_io_buffer_alignment() as u64;
assert_eq!(read.start % align, 0);
assert_eq!(read.start / align, offset_range.first().unwrap().2 / align);
assert_eq!(read.start, offset_range.first().unwrap().2);
let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
@@ -730,68 +525,6 @@ mod tests {
assert_eq!(expected_offsets_in_read, offsets_in_read);
}
#[test]
fn planner_chunked_coalesce_all_test() {
use crate::virtual_file;
let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
// The test explicitly does not check chunk size < 512
if chunk_size < 512 {
return;
}
let max_read_size = chunk_size as usize * 8;
let key = Key::MIN;
let lsn = Lsn(0);
let blob_descriptions = [
(key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN
(key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap
(key, lsn, chunk_size / 2, BlobFlag::None),
(key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap
(key, lsn, chunk_size, BlobFlag::None),
(key, lsn, chunk_size * 2 - 1, BlobFlag::None),
(key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap
(key, lsn, chunk_size * 3 + 1, BlobFlag::None),
(key, lsn, chunk_size * 5 + 1, BlobFlag::None),
(key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
(key, lsn, chunk_size * 7 + 1, BlobFlag::None),
(key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
(key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk
(key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
];
let ranges = [
&[
blob_descriptions[0],
blob_descriptions[2],
blob_descriptions[4],
blob_descriptions[5],
blob_descriptions[7],
blob_descriptions[8],
blob_descriptions[10],
],
&blob_descriptions[11..12],
&blob_descriptions[13..],
];
let mut planner = VectoredReadPlanner::new(max_read_size);
for (key, lsn, offset, flag) in blob_descriptions {
planner.handle(key, lsn, offset, flag);
}
planner.handle_range_end(652 * 1024);
let reads = planner.finish();
assert_eq!(reads.len(), ranges.len());
for (idx, read) in reads.iter().enumerate() {
validate_read(read, ranges[idx]);
}
}
#[test]
fn planner_max_read_size_test() {
let max_read_size = 128 * 1024;
@@ -838,19 +571,18 @@ mod tests {
#[test]
fn planner_replacement_test() {
let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
let max_read_size = 128 * chunk_size as usize;
let max_read_size = 128 * 1024;
let first_key = Key::MIN;
let second_key = first_key.next();
let lsn = Lsn(0);
let blob_descriptions = vec![
(first_key, lsn, 0, BlobFlag::None), // First in read 1
(first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1
(second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll),
(second_key, lsn, 3 * chunk_size, BlobFlag::None),
(second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2
(second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2
(first_key, lsn, 0, BlobFlag::None), // First in read 1
(first_key, lsn, 1024, BlobFlag::None), // Last in read 1
(second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll),
(second_key, lsn, 3 * 1024, BlobFlag::None),
(second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2
(second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2
];
let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
@@ -860,7 +592,7 @@ mod tests {
planner.handle(key, lsn, offset, flag);
}
planner.handle_range_end(6 * chunk_size);
planner.handle_range_end(6 * 1024);
let reads = planner.finish();
assert_eq!(reads.len(), 2);
@@ -1005,7 +737,6 @@ mod tests {
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
let mut buf = BytesMut::with_capacity(reserved_bytes);
let mode = VectoredReadCoalesceMode::get();
let vectored_blob_reader = VectoredBlobReader::new(&file);
let meta = BlobMeta {
key: Key::MIN,
@@ -1017,7 +748,7 @@ mod tests {
if idx + 1 == offsets.len() {
continue;
}
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
let read = read_builder.build();
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
assert_eq!(result.blobs.len(), 1);
@@ -1053,12 +784,4 @@ mod tests {
round_trip_test_compressed(&blobs, true).await?;
Ok(())
}
#[test]
fn test_div_round_up() {
const CHUNK_SIZE: usize = 512;
assert_eq!(1, div_round_up(200, CHUNK_SIZE));
assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
}
}

View File

@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;
use pageserver_api::models::PageserverUtilization;
use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};
use crate::{config::PageServerConf, tenant::mgr::TenantManager};
pub(crate) fn regenerate(
conf: &PageServerConf,
@@ -58,13 +58,13 @@ pub(crate) fn regenerate(
disk_usable_pct,
shard_count,
max_shard_count: MAX_SHARDS,
utilization_score: None,
utilization_score: 0,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};
// Initialize `PageserverUtilization::utilization_score`
let score = doc.cached_score();
NODE_UTILIZATION_SCORE.set(score);
doc.refresh_score();
// TODO: make utilization_score into a metric
Ok(doc)
}
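One side of this hunk initializes the score lazily behind an Option and exports it as a metric; a minimal sketch of that caching pattern follows (the struct, field names, and formula are illustrative stand-ins, not the real PageserverUtilization):

// Editor's illustration only; fields and the score formula are made up.
struct UtilizationSketch {
    disk_usage_bytes: u64,
    disk_usable_bytes: u64,
    utilization_score: Option<u64>,
}

impl UtilizationSketch {
    // Compute the score at most once and reuse the cached value afterwards.
    fn cached_score(&mut self) -> u64 {
        if self.utilization_score.is_none() {
            let pct = self.disk_usage_bytes * 100 / self.disk_usable_bytes.max(1);
            self.utilization_score = Some(pct);
        }
        self.utilization_score.unwrap()
    }
}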

View File

@@ -10,7 +10,6 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use crate::context::RequestContext;
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
@@ -1141,13 +1140,10 @@ impl OpenFiles {
/// server startup.
///
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
pub fn init(num_slots: usize, engine: IoEngineKind) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
if set_io_buffer_alignment(io_buffer_alignment).is_err() {
panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
}
io_engine::init(engine);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
@@ -1171,53 +1167,6 @@ fn get_open_files() -> &'static OpenFiles {
}
}
static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
/// Returns true if `x` is zero or a power of two.
fn is_zero_or_power_of_two(x: usize) -> bool {
(x == 0) || ((x & (x - 1)) == 0)
}
#[allow(unused)]
pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
if is_zero_or_power_of_two(align) {
IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
Ok(())
} else {
Err(align)
}
}
/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
///
/// This function should be used to check the raw config value.
pub(crate) fn get_io_buffer_alignment_raw() -> usize {
let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
if cfg!(test) {
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
if let Some(test_align) = utils::env::var(env_var_name) {
if is_zero_or_power_of_two(test_align) {
test_align
} else {
panic!("IO buffer alignment ({test_align}) is not a power of two");
}
} else {
align
}
} else {
align
}
}
/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
///
/// This function should be used for getting the actual alignment value to use.
pub(crate) fn get_io_buffer_alignment() -> usize {
let align = get_io_buffer_alignment_raw();
align.max(1)
}
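A short editor's illustration of the two helpers above: the power-of-two check relies on the x & (x - 1) bit trick, and the effective alignment never drops below 1, so callers can safely use it as a divisor or modulus:

fn alignment_examples() {
    assert!(is_zero_or_power_of_two(0));    // 0 means "no alignment requirement"
    assert!(is_zero_or_power_of_two(512));  // single bit set: 512 & 511 == 0
    assert!(!is_zero_or_power_of_two(768)); // 768 = 512 + 256, two bits set
    // get_io_buffer_alignment() maps a raw value of 0 to 1:
    assert_eq!(0usize.max(1), 1);
    assert_eq!(512usize.max(1), 512);
}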
#[cfg(test)]
mod tests {
use crate::context::DownloadBehavior;

View File

@@ -78,7 +78,6 @@ where
.expect("must not use after we returned an error")
}
/// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn write_buffered<S: IoBuf + Send>(
&mut self,

View File

@@ -21,25 +21,19 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoke Rust code.
use std::time::Duration;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::TimestampTz;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use utils::failpoint_support;
use utils::rate_limit::RateLimit;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::{DatadirModification, Version};
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::walrecord::*;
@@ -59,13 +53,6 @@ pub struct WalIngest {
shard: ShardIdentity,
checkpoint: CheckPoint,
checkpoint_modified: bool,
warn_ingest_lag: WarnIngestLag,
}
struct WarnIngestLag {
lag_msg_ratelimit: RateLimit,
future_lsn_msg_ratelimit: RateLimit,
timestamp_invalid_msg_ratelimit: RateLimit,
}
impl WalIngest {
@@ -84,11 +71,6 @@ impl WalIngest {
shard: *timeline.get_shard_identity(),
checkpoint,
checkpoint_modified: false,
warn_ingest_lag: WarnIngestLag {
lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
},
})
}
@@ -1230,48 +1212,6 @@ impl WalIngest {
Ok(())
}
fn warn_on_ingest_lag(
&mut self,
conf: &crate::config::PageServerConf,
wal_timestmap: TimestampTz,
) {
debug_assert_current_span_has_tenant_and_timeline_id();
let now = SystemTime::now();
let rate_limits = &mut self.warn_ingest_lag;
match try_from_pg_timestamp(wal_timestmap) {
Ok(ts) => {
match now.duration_since(ts) {
Ok(lag) => {
if lag > conf.wait_lsn_timeout {
rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
let lag = humantime::format_duration(lag);
warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
})
}
},
Err(e) => {
let delta_t = e.duration();
// determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
// => https://www.robustperception.io/time-metric-from-the-node-exporter/
const IGNORED_DRIFT: Duration = Duration::from_millis(100);
if delta_t > IGNORED_DRIFT {
let delta_t = humantime::format_duration(delta_t);
rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| {
warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future");
})
}
}
};
}
Err(error) => {
rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries");
})
}
}
}
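For reference, SystemTime::duration_since returns Ok(lag) when the WAL timestamp lies in the past and Err (carrying the absolute drift) when it lies in the future; an editor's sketch of the classification above, reusing the same 100 ms drift allowance (the string labels stand in for the rate-limited warnings):

use std::time::{Duration, SystemTime};

fn classify_wal_timestamp(
    now: SystemTime,
    wal_ts: SystemTime,
    wait_lsn_timeout: Duration,
) -> &'static str {
    const IGNORED_DRIFT: Duration = Duration::from_millis(100);
    match now.duration_since(wal_ts) {
        Ok(lag) if lag > wait_lsn_timeout => "lagging more than wait_lsn_timeout",
        Ok(_) => "ok",
        Err(e) if e.duration() > IGNORED_DRIFT => "timestamp from the future",
        Err(_) => "ok (within expected clock drift)",
    }
}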
/// Subroutine of ingest_record(), to handle an XLOG_XACT_* records.
///
async fn ingest_xact_record(
@@ -1288,8 +1228,6 @@ impl WalIngest {
let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
let mut page_xids: Vec<TransactionId> = vec![parsed.xid];
self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time);
for subxact in &parsed.subxacts {
let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if subxact_pageno != pageno {
@@ -2365,9 +2303,6 @@ mod tests {
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap();
let span = harness
.span()
.in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID));
let (tenant, ctx) = harness.load().await;
let remote_initdb_path =
@@ -2419,7 +2354,6 @@ mod tests {
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
.instrument(span.clone())
.await
.unwrap();
}

View File

@@ -1,7 +1,13 @@
diff --git a/expected/ut-A.out b/expected/ut-A.out
index da723b8..5328114 100644
--- a/expected/ut-A.out
+++ b/expected/ut-A.out
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Thu Jun 6 08:02:42 2024 +0000
Patch expected files to consider Neon's log messages
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
index da723b8..f8d0102 100644
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
@@ -9,13 +9,16 @@ SET search_path TO public;
----
-- No.A-1-1-3
@@ -19,18 +25,10 @@ index da723b8..5328114 100644
DROP SCHEMA other_schema;
----
---- No. A-5-1 comment pattern
@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON (s.dbid = d.oid)
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
ORDER BY 1;
query | calls
--------------------------------------+-------
diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
index d372459..6282afe 100644
--- a/expected/ut-fdw.out
+++ b/expected/ut-fdw.out
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
SET client_min_messages TO LOG;
SET pg_hint_plan.enable_hint TO on;
@@ -39,15 +37,3 @@ index d372459..6282afe 100644
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
diff --git a/sql/ut-A.sql b/sql/ut-A.sql
index 7c7d58a..4fd1a07 100644
--- a/sql/ut-A.sql
+++ b/sql/ut-A.sql
@@ -963,6 +963,7 @@ SELECT s.query, s.calls
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON (s.dbid = d.oid)
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
ORDER BY 1;
----

View File

@@ -550,6 +550,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
case 2:
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
break;
case 1:
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
break;
default:
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
}
@@ -1060,7 +1063,7 @@ pg_init_libpagestore(void)
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
2, /* min */
1, /* min */
2, /* max */
PGC_SU_BACKEND,
0, /* no flags required */

View File

@@ -87,8 +87,9 @@ typedef enum {
* can skip traversing through recent layers which we know to not contain any
* versions for the requested page.
*
* These structs describe the V2 of these requests. (The old now-defunct V1
* protocol contained just one LSN and a boolean 'latest' flag.)
* These structs describe the V2 of these requests. The old V1 protocol contained
* just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
* set to 1, we will convert these to the V1 requests before sending.
*/
typedef struct
{

View File

@@ -1001,10 +1001,51 @@ nm_pack_request(NeonRequest *msg)
initStringInfo(&s);
pq_sendbyte(&s, msg->tag);
pq_sendint64(&s, msg->lsn);
pq_sendint64(&s, msg->not_modified_since);
if (neon_protocol_version >= 2)
{
pq_sendbyte(&s, msg->tag);
pq_sendint64(&s, msg->lsn);
pq_sendint64(&s, msg->not_modified_since);
}
else
{
bool latest;
XLogRecPtr lsn;
/*
* In the primary, we always request the latest page version.
*/
if (!RecoveryInProgress())
{
latest = true;
lsn = msg->not_modified_since;
}
else
{
/*
* In protocol V1, we cannot express both that we want to read the
* page at LSN X and that we know it hasn't been modified since
* Y. We can either use 'not_modified_lsn' as the request LSN, and
* risk getting an error if that LSN is too old and has already
* fallen out of the pageserver's GC horizon, or we can send
* 'request_lsn', causing the pageserver to possibly wait for the
* recent WAL to arrive unnecessarily. Or something in between. We
* choose to use the old LSN and risk GC errors, because that's
* what we've done historically.
*/
latest = false;
lsn = msg->not_modified_since;
}
pq_sendbyte(&s, msg->tag);
pq_sendbyte(&s, latest);
pq_sendint64(&s, lsn);
}
/*
* The rest of the request messages are the same between protocol V1 and
* V2
*/
switch (messageTag(msg))
{
/* pagestore_client -> pagestore */

View File

@@ -110,8 +110,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
tag.rinfo = rinfo;
tag.forknum = forknum;
/* We need exclusive lock here because of LRU list manipulation */
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
LWLockAcquire(relsize_lock, LW_SHARED);
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
if (entry != NULL)
{

View File

@@ -220,64 +220,6 @@ nwp_register_gucs(void)
NULL, NULL, NULL);
}
static int
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
{
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
{
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
wpg_log(FATAL, "too many safekeepers");
}
coma = strchr(coma, ',');
safekeepers[n_safekeepers-1] = curr_sk;
if (coma != NULL) {
*coma++ = '\0';
}
}
return n_safekeepers;
}
/*
* Accept two comma-separated strings with lists of safekeeper host:port addresses.
* Split them into arrays and return false if the two sets do not match, ignoring order.
*/
static bool
safekeepers_cmp(char *old, char *new)
{
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
len_old = split_safekeepers_list(old, safekeepers_old);
len_new = split_safekeepers_list(new, safekeepers_new);
if (len_old != len_new)
{
return false;
}
qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
for (int i = 0; i < len_new; i++)
{
if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
{
return false;
}
}
return true;
}
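The same order-insensitive comparison, sketched in Rust for illustration (the C code above splits both lists, sorts them with pg_qsort_strcmp, and compares element by element):

// Editor's sketch of what safekeepers_cmp() computes.
fn safekeeper_lists_match(old: &str, new: &str) -> bool {
    let mut old_list: Vec<&str> = old.split(',').collect();
    let mut new_list: Vec<&str> = new.split(',').collect();
    old_list.sort_unstable();
    new_list.sort_unstable();
    old_list == new_list
}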
/*
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
* the list changed.
@@ -293,26 +235,19 @@ assign_neon_safekeepers(const char *newval, void *extra)
wpg_log(FATAL, "neon.safekeepers is empty");
}
/* Copy values because we will modify them in split_safekeepers_list() */
char *newval_copy = pstrdup(newval);
char *oldval = pstrdup(wal_acceptors_list);
/*
* TODO: restarting through FATAL is stupid and introduces 1s delay before
* next bgw start. We should refactor walproposer to allow graceful exit and
* thus remove this delay.
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
*/
if (!safekeepers_cmp(oldval, newval_copy))
if (strcmp(wal_acceptors_list, newval) != 0)
{
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
wal_acceptors_list, newval);
}
pfree(newval_copy);
pfree(oldval);
}
/* Check if we need to suspend inserts because of lagging replication. */
static uint64
backpressure_lag_impl(void)
{

View File

@@ -2,7 +2,6 @@
import argparse
import enum
import os
import subprocess
import sys
from typing import List
@@ -94,7 +93,7 @@ if __name__ == "__main__":
"--no-color",
action="store_true",
help="disable colored output",
default=not sys.stdout.isatty() or os.getenv("TERM") == "dumb",
default=not sys.stdout.isatty(),
)
args = parser.parse_args()

View File

@@ -114,6 +114,9 @@ rsa = "0.9"
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
ktls = "6"
[dev-dependencies]
camino-tempfile.workspace = true
fallible-iterator.workspace = true

View File

@@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a
new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon)
* postgres
uses postgres to select auth secrets of existing roles. Useful for local testing
* web (or link)
* link
sends login link for all usernames
Also proxy can expose following services to the external world:
@@ -36,7 +36,7 @@ To play with it locally one may start proxy over a local postgres installation
```
If both postgres and proxy are running you may send a SQL query:
```console
```json
curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
-H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \
-H 'Content-Type: application/json' \
@@ -44,8 +44,7 @@ curl -k -X POST 'https://proxy.localtest.me:4444/sql' \
"query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
"params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}]
}' | jq
```
```json
{
"command": "SELECT",
"fields": [

View File

@@ -1,20 +1,20 @@
//! Client authentication mechanisms.
pub mod backend;
pub use backend::Backend;
pub use backend::BackendType;
mod credentials;
pub(crate) use credentials::{
pub use credentials::{
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
ComputeUserInfoParseError, IpPattern,
};
mod password_hack;
pub(crate) use password_hack::parse_endpoint_param;
pub use password_hack::parse_endpoint_param;
use password_hack::PasswordHackPayload;
mod flow;
pub(crate) use flow::*;
pub use flow::*;
use tokio::time::error::Elapsed;
use crate::{
@@ -25,13 +25,13 @@ use std::{io, net::IpAddr};
use thiserror::Error;
/// Convenience wrapper for the authentication error.
pub(crate) type Result<T> = std::result::Result<T, AuthError>;
pub type Result<T> = std::result::Result<T, AuthError>;
/// Common authentication error.
#[derive(Debug, Error)]
pub(crate) enum AuthErrorImpl {
pub enum AuthErrorImpl {
#[error(transparent)]
Web(#[from] backend::WebAuthError),
Link(#[from] backend::LinkAuthError),
#[error(transparent)]
GetAuthInfo(#[from] console::errors::GetAuthInfoError),
@@ -77,30 +77,30 @@ pub(crate) enum AuthErrorImpl {
#[derive(Debug, Error)]
#[error(transparent)]
pub(crate) struct AuthError(Box<AuthErrorImpl>);
pub struct AuthError(Box<AuthErrorImpl>);
impl AuthError {
pub(crate) fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
AuthErrorImpl::BadAuthMethod(name.into()).into()
}
pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
AuthErrorImpl::AuthFailed(user.into()).into()
}
pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
AuthErrorImpl::IpAddressNotAllowed(ip).into()
}
pub(crate) fn too_many_connections() -> Self {
pub fn too_many_connections() -> Self {
AuthErrorImpl::TooManyConnections.into()
}
pub(crate) fn is_auth_failed(&self) -> bool {
pub fn is_auth_failed(&self) -> bool {
matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
}
pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
pub fn user_timeout(elapsed: Elapsed) -> Self {
AuthErrorImpl::UserTimeout(elapsed).into()
}
}
@@ -114,7 +114,7 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
match self.0.as_ref() {
AuthErrorImpl::Web(e) => e.to_string_client(),
AuthErrorImpl::Link(e) => e.to_string_client(),
AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
AuthErrorImpl::Sasl(e) => e.to_string_client(),
AuthErrorImpl::AuthFailed(_) => self.to_string(),
@@ -132,7 +132,7 @@ impl UserFacingError for AuthError {
impl ReportableError for AuthError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self.0.as_ref() {
AuthErrorImpl::Web(e) => e.get_error_kind(),
AuthErrorImpl::Link(e) => e.get_error_kind(),
AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
AuthErrorImpl::Sasl(e) => e.get_error_kind(),
AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,

Some files were not shown because too many files have changed in this diff.