fixup: incorrect assumption on img layer need

fixup: serialize again in image layer
fixup: order of waiters in delta layer
2026-05-17 05:00:38 +00:00 · 2024-09-12 23:04:58 +01:00 · 2024-09-12 23:04:10 +01:00 · 2024-09-12 23:04:01 +01:00 · 2024-09-12 20:00:06 +01:00 · 2024-09-12 19:25:46 +01:00
190 changed files with 3132 additions and 5233 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -62,7 +62,7 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
+          for r in 14 15 16; do
            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
          done
@@ -83,10 +83,6 @@ jobs:
        id: pg_v16_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT

-      - name: Set pg 17 revision for caching
-        id: pg_v17_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
-
      # Set some environment variables used by all the steps.
      #
      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -140,13 +136,6 @@ jobs:
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

-      - name: Cache postgres v17 build
-        id: cache_pg_17
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
-
      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        run: mold -run make postgres-v14 -j$(nproc)
@@ -159,10 +148,6 @@ jobs:
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
        run: mold -run make postgres-v16 -j$(nproc)

-      - name: Build postgres v17
-        if: steps.cache_pg_17.outputs.cache-hit != 'true'
-        run: mold -run make postgres-v17 -j$(nproc)
-
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

@@ -225,7 +210,7 @@ jobs:
        run: |
          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
          export PQ_LIB_DIR
-          LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
+          LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
          export LD_LIBRARY_PATH

          #nextest does not yet support running doctests
--- a/.github/workflows/_push-to-acr.yml
+++ b/.github/workflows/_push-to-acr.yml
@@ -52,5 +52,5 @@ jobs:
          for image in ${images}; do
            docker buildx imagetools create \
              -t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
-                                                        neondatabase/${image}:${{ inputs.image_tag }}
+                                        neondatabase/${image}:${{ inputs.image_tag }}
          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -54,8 +54,8 @@ jobs:
      build-tag: ${{steps.build-tag.outputs.tag}}

    steps:
-      # Need `fetch-depth: 0` to count the number of commits in the branch
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -211,7 +211,7 @@ jobs:
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
+      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -357,7 +357,6 @@ jobs:
            })

  coverage-report:
-    if: ${{ !startsWith(github.ref_name, 'release') }}
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
    runs-on: [ self-hosted, small ]
    container:
@@ -374,8 +373,8 @@ jobs:
        coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
-      # Need `fetch-depth: 0` for differential coverage (to get diff between two commits)
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0
@@ -476,9 +475,11 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
@@ -547,15 +548,17 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version: [ v14, v15, v16 ]
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          submodules: true
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
@@ -624,7 +627,7 @@ jobs:

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
-        if: matrix.version == 'v17'
+        if: matrix.version == 'v16'
        uses: docker/build-push-action@v6
        with:
          target: compute-tools-image
@@ -646,7 +649,7 @@ jobs:

    strategy:
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version: [ v14, v15, v16 ]

    steps:
      - uses: docker/login-action@v3
@@ -668,7 +671,7 @@ jobs:
                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64

      - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v17'
+        if: matrix.version == 'v16'
        run: |
          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
@@ -686,7 +689,7 @@ jobs:
                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}

      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v17'
+        if: matrix.version == 'v16'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -697,12 +700,15 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version: [ v14, v15, v16 ]
    env:
      VM_BUILDER_VERSION: v0.29.3

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Downloading vm-builder
        run: |
@@ -742,7 +748,10 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    steps:
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
@@ -789,7 +798,7 @@ jobs:
    runs-on: ubuntu-22.04

    env:
-      VERSIONS: v14 v15 v16 v17
+      VERSIONS: v14 v15 v16

    steps:
      - uses: docker/login-action@v3
@@ -830,7 +839,7 @@ jobs:
            done
          done
          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
-                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
+                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}

      - name: Login to prod ECR
        uses: docker/login-action@v3
@@ -843,7 +852,7 @@ jobs:
      - name: Copy all images to prod ECR
        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
        run: |
-          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done
@@ -855,7 +864,7 @@ jobs:
    with:
      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -867,7 +876,7 @@ jobs:
    with:
      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 compute-node-v14 compute-node-v15 compute-node-v16
      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -948,7 +957,6 @@ jobs:

  deploy:
    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
-    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()

    runs-on: [ self-hosted, small ]
@@ -963,12 +971,15 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-          for r in 14 15 16 17; do
+          for r in 14 15 16; do
            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
          done

-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Trigger deploy workflow
        env:
@@ -1047,8 +1058,7 @@ jobs:
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
-    # `!failure() && !cancelled()` is required because the workflow transitively depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
-    if: github.ref_name == 'release' && !failure() && !cancelled()
+    if: github.ref_name == 'release'

    runs-on: ubuntu-22.04
    steps:
@@ -1107,7 +1117,6 @@ jobs:

              files_to_promote+=("s3://${BUCKET}/${s3_key}")

-              # TODO Add v17
              for pg_version in v14 v15 v16; do
                # We run less tests for debug builds, so we don't need to promote them
                if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -72,10 +72,6 @@ jobs:
        id: pg_v16_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT

-      - name: Set pg 17 revision for caching
-        id: pg_v17_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
-
      - name: Cache postgres v14 build
        id: cache_pg_14
        uses: actions/cache@v4
@@ -97,13 +93,6 @@ jobs:
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

-      - name: Cache postgres v17 build
-        id: cache_pg_17
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
      - name: Set extra env for macOS
        run: |
          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -131,10 +120,6 @@ jobs:
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

-      - name: Build postgres v17
-        if: steps.cache_pg_17.outputs.cache-hit != 'true'
-        run: make postgres-v17 -j$(sysctl -n hw.ncpu)
-
      - name: Build neon extensions
        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)

@@ -181,7 +166,7 @@ jobs:
        run: make walproposer-lib -j$(nproc)

      - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc)

      - name: Upload the build stats
        id: upload-stats
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -34,8 +34,8 @@ jobs:
      build-tag: ${{ steps.build-tag.outputs.tag }}

    steps:
-      # Need `fetch-depth: 0` to count the number of commits in the branch
-      - uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

--- a/.gitmodules
+++ b/.gitmodules
@@ -10,7 +10,3 @@
 	path = vendor/postgres-v16
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_16_STABLE_neon
-[submodule "vendor/postgres-v17"]
-	path = vendor/postgres-v17
-	url = https://github.com/neondatabase/postgres.git
-	branch = REL_17_STABLE_neon
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1189,9 +1189,9 @@ dependencies = [

 [[package]]
 name = "comfy-table"
-version = "7.1.1"
+version = "6.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
+checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d"
 dependencies = [
 "crossterm",
 "strum",
@@ -1209,6 +1209,7 @@ dependencies = [
 "remote_storage",
 "serde",
 "serde_json",
+ "serde_with",
 "utils",
 ]

@@ -1217,6 +1218,7 @@ name = "compute_tools"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "bytes",
 "cfg-if",
 "chrono",
@@ -1235,6 +1237,7 @@ dependencies = [
 "reqwest 0.12.4",
 "rlimit",
 "rust-ini",
+ "serde",
 "serde_json",
 "signal-hook",
 "tar",
@@ -1243,6 +1246,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -1313,9 +1317,12 @@ dependencies = [
 name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
 "chrono",
 "rand 0.8.5",
 "serde",
+ "serde_with",
+ "utils",
 ]

 [[package]]
@@ -1327,7 +1334,9 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "futures",
 "git-version",
+ "hex",
 "humantime",
 "humantime-serde",
 "hyper 0.14.26",
@@ -1335,6 +1344,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "pageserver_client",
+ "postgres",
 "postgres_backend",
 "postgres_connection",
 "regex",
@@ -1343,13 +1353,15 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "serde_with",
 "storage_broker",
+ "tar",
 "thiserror",
 "tokio",
 "tokio-postgres",
 "tokio-util",
- "toml",
- "toml_edit",
+ "toml 0.7.4",
+ "toml_edit 0.19.10",
 "tracing",
 "url",
 "utils",
@@ -1473,22 +1485,25 @@ checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"

 [[package]]
 name = "crossterm"
-version = "0.27.0"
+version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
+checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 1.3.2",
 "crossterm_winapi",
 "libc",
+ "mio",
 "parking_lot 0.12.1",
+ "signal-hook",
+ "signal-hook-mio",
 "winapi",
 ]

 [[package]]
 name = "crossterm_winapi"
-version = "0.9.1"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
+checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
 dependencies = [
 "winapi",
 ]
@@ -1651,6 +1666,7 @@ dependencies = [
 "hex",
 "parking_lot 0.12.1",
 "rand 0.8.5",
+ "scopeguard",
 "smallvec",
 "tracing",
 "utils",
@@ -2220,22 +2236,24 @@ checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"

 [[package]]
 name = "git-version"
-version = "0.3.9"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ad568aa3db0fcbc81f2f116137f263d7304f512a1209b35b85150d3ef88ad19"
+checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
 dependencies = [
 "git-version-macro",
+ "proc-macro-hack",
 ]

 [[package]]
 name = "git-version-macro"
-version = "0.3.9"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
+checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
 dependencies = [
+ "proc-macro-hack",
 "proc-macro2",
 "quote",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -2729,6 +2747,19 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "inotify"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
+dependencies = [
+ "bitflags 1.3.2",
+ "futures-core",
+ "inotify-sys",
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "inotify-sys"
 version = "0.1.5"
@@ -3113,7 +3144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd01039851e82f8799046eabbb354056283fb265c8ec0996af940f4e85a380ff"
 dependencies = [
 "serde",
- "toml",
+ "toml 0.8.14",
 ]

 [[package]]
@@ -3223,7 +3254,7 @@ dependencies = [
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
- "inotify",
+ "inotify 0.9.6",
 "kqueue",
 "libc",
 "log",
@@ -3614,6 +3645,7 @@ name = "pagectl"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "bytes",
 "camino",
 "clap",
 "git-version",
@@ -3622,12 +3654,13 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "remote_storage",
+ "serde",
 "serde_json",
 "svg_fmt",
 "thiserror",
 "tokio",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "utils",
 "workspace_hack",
 ]
@@ -3640,6 +3673,7 @@ dependencies = [
 "arc-swap",
 "async-compression",
 "async-stream",
+ "async-trait",
 "bit_field",
 "byteorder",
 "bytes",
@@ -3647,13 +3681,16 @@ dependencies = [
 "camino-tempfile",
 "chrono",
 "clap",
+ "const_format",
 "consumption_metrics",
 "crc32c",
 "criterion",
+ "crossbeam-utils",
 "either",
 "enum-map",
 "enumset",
 "fail",
+ "flate2",
 "futures",
 "git-version",
 "hex",
@@ -3692,9 +3729,13 @@ dependencies = [
 "serde_json",
 "serde_path_to_error",
 "serde_with",
+ "signal-hook",
+ "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
+ "svg_fmt",
+ "sync_wrapper",
 "sysinfo",
 "tenant_size_model",
 "thiserror",
@@ -3706,8 +3747,9 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
+ "twox-hash",
 "url",
 "utils",
 "walkdir",
@@ -3771,22 +3813,44 @@ name = "pageserver_compaction"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-compression",
 "async-stream",
+ "byteorder",
+ "bytes",
+ "chrono",
 "clap",
+ "const_format",
+ "consumption_metrics",
 "criterion",
+ "crossbeam-utils",
+ "either",
+ "fail",
+ "flate2",
 "futures",
 "git-version",
+ "hex",
 "hex-literal",
+ "humantime",
+ "humantime-serde",
 "itertools 0.10.5",
+ "metrics",
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
 "rand 0.8.5",
+ "smallvec",
 "svg_fmt",
+ "sync_wrapper",
+ "thiserror",
 "tokio",
+ "tokio-io-timeout",
+ "tokio-util",
 "tracing",
+ "tracing-error",
 "tracing-subscriber",
+ "url",
 "utils",
+ "walkdir",
 "workspace_hack",
 ]

@@ -3846,9 +3910,8 @@ dependencies = [

 [[package]]
 name = "parquet"
-version = "53.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "ahash",
 "bytes",
@@ -3867,9 +3930,8 @@ dependencies = [

 [[package]]
 name = "parquet_derive"
-version = "53.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86e9fcfae007533a06b580429a3f7e07cb833ec8aa37c041c16563e7918f057e"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "parquet",
 "proc-macro2",
@@ -4059,7 +4121,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4072,7 +4134,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -4091,7 +4153,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4103,7 +4165,9 @@ name = "postgres_backend"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "bytes",
+ "futures",
 "once_cell",
 "pq_proto",
 "rustls 0.22.4",
@@ -4136,13 +4200,16 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bindgen",
+ "byteorder",
 "bytes",
 "crc32c",
 "env_logger",
+ "hex",
 "log",
 "memoffset 0.8.0",
 "once_cell",
 "postgres",
+ "rand 0.8.5",
 "regex",
 "serde",
 "thiserror",
@@ -4177,11 +4244,13 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
+ "pin-project-lite",
 "postgres-protocol",
 "rand 0.8.5",
 "serde",
 "thiserror",
 "tokio",
+ "tracing",
 ]

 [[package]]
@@ -4213,6 +4282,12 @@ dependencies = [
 "elliptic-curve 0.13.8",
 ]

+[[package]]
+name = "proc-macro-hack"
+version = "0.5.20+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -4331,6 +4406,7 @@ dependencies = [
 "aws-config",
 "aws-sdk-iam",
 "aws-sigv4",
+ "aws-types",
 "base64 0.13.1",
 "bstr",
 "bytes",
@@ -4339,6 +4415,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "crossbeam-deque",
 "dashmap",
 "ecdsa 0.16.9",
 "env_logger",
@@ -4364,9 +4441,11 @@ dependencies = [
 "jose-jwa",
 "jose-jwk",
 "lasso",
+ "md5",
 "measured",
 "metrics",
 "once_cell",
+ "opentelemetry",
 "p256 0.13.2",
 "parking_lot 0.12.1",
 "parquet",
@@ -4387,6 +4466,7 @@ dependencies = [
 "reqwest-middleware",
 "reqwest-retry",
 "reqwest-tracing",
+ "routerify",
 "rsa",
 "rstest",
 "rustc-hash",
@@ -4402,6 +4482,7 @@ dependencies = [
 "smol_str",
 "socket2 0.5.5",
 "subtle",
+ "task-local-extensions",
 "thiserror",
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
@@ -4411,6 +4492,7 @@ dependencies = [
 "tokio-rustls 0.25.0",
 "tokio-tungstenite",
 "tokio-util",
+ "tower-service",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -4700,6 +4782,7 @@ dependencies = [
 "async-stream",
 "async-trait",
 "aws-config",
+ "aws-credential-types",
 "aws-sdk-s3",
 "aws-smithy-async",
 "aws-smithy-types",
@@ -4713,6 +4796,7 @@ dependencies = [
 "futures",
 "futures-util",
 "http-types",
+ "humantime",
 "humantime-serde",
 "hyper 0.14.26",
 "itertools 0.10.5",
@@ -4728,7 +4812,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "utils",
 ]
@@ -5192,12 +5276,14 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
+ "async-trait",
 "byteorder",
 "bytes",
 "camino",
 "camino-tempfile",
 "chrono",
 "clap",
+ "const_format",
 "crc32c",
 "desim",
 "fail",
@@ -5223,7 +5309,9 @@ dependencies = [
 "sd-notify",
 "serde",
 "serde_json",
+ "serde_with",
 "sha2",
+ "signal-hook",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -5234,6 +5322,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-subscriber",
 "url",
@@ -5248,6 +5337,7 @@ version = "0.1.0"
 dependencies = [
 "const_format",
 "serde",
+ "serde_with",
 "utils",
 ]

@@ -5641,6 +5731,17 @@ dependencies = [
 "signal-hook-registry",
 ]

+[[package]]
+name = "signal-hook-mio"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
+dependencies = [
+ "libc",
+ "mio",
+ "signal-hook",
+]
+
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.1"
@@ -5776,6 +5877,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
+ "bytes",
 "clap",
 "const_format",
 "futures",
@@ -5789,6 +5891,7 @@ dependencies = [
 "parking_lot 0.12.1",
 "prost",
 "tokio",
+ "tokio-stream",
 "tonic",
 "tonic-build",
 "tracing",
@@ -5801,7 +5904,9 @@ name = "storage_controller"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "aws-config",
 "bytes",
+ "camino",
 "chrono",
 "clap",
 "control_plane",
@@ -5842,9 +5947,20 @@ dependencies = [
 name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "bytes",
+ "futures",
+ "pageserver_api",
 "pageserver_client",
+ "postgres",
 "reqwest 0.12.4",
 "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-stream",
+ "tokio-util",
+ "utils",
 "workspace_hack",
 ]

@@ -5856,9 +5972,13 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
 "camino",
 "chrono",
 "clap",
+ "crc32c",
 "either",
 "futures",
 "futures-util",
@@ -5870,16 +5990,20 @@ dependencies = [
 "pageserver",
 "pageserver_api",
 "postgres_ffi",
+ "rand 0.8.5",
 "remote_storage",
 "reqwest 0.12.4",
 "rustls 0.22.4",
 "rustls-native-certs 0.7.0",
 "serde",
 "serde_json",
+ "serde_with",
 "storage_controller_client",
+ "thiserror",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
+ "tokio-rustls 0.25.0",
 "tokio-stream",
 "tokio-util",
 "tracing",
@@ -5898,11 +6022,14 @@ dependencies = [
 "comfy-table",
 "futures",
 "humantime",
+ "hyper 0.14.26",
 "pageserver_api",
 "pageserver_client",
 "reqwest 0.12.4",
+ "serde",
 "serde_json",
 "storage_controller_client",
+ "thiserror",
 "tokio",
 "tracing",
 "utils",
@@ -5927,21 +6054,21 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"

 [[package]]
 name = "strum"
-version = "0.26.3"
+version = "0.24.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f"

 [[package]]
 name = "strum_macros"
-version = "0.26.4"
+version = "0.24.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59"
 dependencies = [
- "heck 0.5.0",
+ "heck 0.4.1",
 "proc-macro2",
 "quote",
 "rustversion",
- "syn 2.0.52",
+ "syn 1.0.109",
 ]

 [[package]]
@@ -6025,6 +6152,15 @@ dependencies = [
 "xattr",
 ]

+[[package]]
+name = "task-local-extensions"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8"
+dependencies = [
+ "pin-utils",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.9.0"
@@ -6273,7 +6409,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6384,6 +6520,18 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "toml"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec"
+dependencies = [
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "toml_edit 0.19.10",
+]
+
 [[package]]
 name = "toml"
 version = "0.8.14"
@@ -6393,7 +6541,7 @@ dependencies = [
 "serde",
 "serde_spanned",
 "toml_datetime",
- "toml_edit",
+ "toml_edit 0.22.14",
 ]

 [[package]]
@@ -6405,6 +6553,19 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "toml_edit"
+version = "0.19.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
+dependencies = [
+ "indexmap 1.9.3",
+ "serde",
+ "serde_spanned",
+ "toml_datetime",
+ "winnow 0.4.6",
+]
+
 [[package]]
 name = "toml_edit"
 version = "0.22.14"
@@ -6415,7 +6576,7 @@ dependencies = [
 "serde",
 "serde_spanned",
 "toml_datetime",
- "winnow",
+ "winnow 0.6.13",
 ]

 [[package]]
@@ -6615,6 +6776,7 @@ dependencies = [
 "opentelemetry",
 "opentelemetry-otlp",
 "opentelemetry-semantic-conventions",
+ "reqwest 0.12.4",
 "tokio",
 "tracing",
 "tracing-opentelemetry",
@@ -6818,6 +6980,7 @@ dependencies = [
 "serde_assert",
 "serde_json",
 "serde_path_to_error",
+ "serde_with",
 "signal-hook",
 "strum",
 "strum_macros",
@@ -6826,7 +6989,7 @@ dependencies = [
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
- "toml_edit",
+ "toml_edit 0.19.10",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
@@ -6873,11 +7036,13 @@ dependencies = [
 "cgroups-rs",
 "clap",
 "futures",
+ "inotify 0.10.2",
 "serde",
 "serde_json",
 "sysinfo",
 "tokio",
 "tokio-postgres",
+ "tokio-stream",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -6904,6 +7069,7 @@ dependencies = [
 "clap",
 "env_logger",
 "log",
+ "once_cell",
 "postgres",
 "postgres_ffi",
 "regex",
@@ -7369,6 +7535,15 @@ version = "0.52.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"

+[[package]]
+name = "winnow"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "winnow"
 version = "0.6.13"
@@ -7426,7 +7601,6 @@ dependencies = [
 "digest",
 "either",
 "fail",
- "futures",
 "futures-channel",
 "futures-executor",
 "futures-io",
@@ -7477,13 +7651,10 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.24.0",
 "tokio-util",
- "toml_edit",
 "tonic",
 "tower",
 "tracing",
 "tracing-core",
- "tracing-log",
- "tracing-subscriber",
 "url",
 "uuid",
 "zeroize",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -73,7 +73,7 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-comfy-table = "7.1"
+comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-deque = "0.8.5"
@@ -123,8 +123,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "53", default-features = false, features = ["zstd"] }
-parquet_derive = "53"
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
@@ -158,8 +158,8 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-strum = "0.26"
-strum_macros = "0.26"
+strum = "0.24"
+strum_macros = "0.24"
 "subtle"  = "2.5.0"
 svg_fmt = "0.4.3"
 sync_wrapper = "0.1.2"
@@ -177,8 +177,8 @@ tokio-rustls = "0.25"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
-toml = "0.8"
-toml_edit = "0.22"
+toml = "0.7"
+toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
@@ -201,21 +201,10 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-
-# We want to use the 'neon' branch for these, but there's currently one
-# incompatible change on the branch. See:
-#
-# - PR #8076 which contained changes that depended on the new changes in
-#   the rust-postgres crate, and
-# - PR #8654 which reverted those changes and made the code in proxy incompatible
-#   with the tip of the 'neon' branch again.
-#
-# When those proxy changes are re-applied (see PR #8747), we can switch using
-# the tip of the 'neon' branch again.
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
@@ -252,7 +241,11 @@ tonic-build = "0.9"
 [patch.crates-io]

 # Needed to get `tokio-postgres-rustls` to depend on our fork.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+
+# bug fixes for UUID
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }

 ################# Binary contents sections

--- a/12
+++ b/12
@@ -5,8 +5,6 @@
 ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
-ARG DEFAULT_PG_VERSION=17
-ARG STABLE_PG_VERSION=16

 # Build Postgres
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -15,7 +13,6 @@ WORKDIR /home/nonroot
 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
 COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
-COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -31,19 +28,16 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
-ARG STABLE_PG_VERSION

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
-COPY --from=pg-build /home/nonroot/pg_install/v17/lib                       pg_install/v17/lib
 COPY --chown=nonroot . .

 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
@@ -58,7 +52,6 @@ RUN set -e \
 # Build final image
 #
 FROM debian:bullseye-slim
-ARG DEFAULT_PG_VERSION
 WORKDIR /data

 RUN set -e \
@@ -84,7 +77,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubbe
 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
 COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
-COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
@@ -101,7 +93,7 @@ RUN mkdir -p /data/.neon/ && \

 # When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
 # that want a particular postgres version will select it explicitly: this is just a default.
-ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
+ENV LD_LIBRARY_PATH=/usr/local/v16/lib


 VOLUME ["/data"]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -55,27 +55,22 @@ RUN cd postgres && \
    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
    # so we do it here.
+    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
+    # the first loop is for pg_stat_statement extension version <= 1.6
    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
        filename=$(basename "$file"); \
-        # Note that there are no downgrade scripts for pg_stat_statements, so we \
-        # don't have to modify any downgrade paths or (much) older versions: we only \
-        # have to make sure every creation of the pg_stat_statements_reset function \
-        # also adds execute permissions to the neon_superuser.
-        case $filename in \
-          pg_stat_statements--1.4.sql) \
-            # pg_stat_statements_reset is first created with 1.4
+        if echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.6--1.7.sql) \
-            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
+        fi; \
+    done; \
+    # the second loop is for pg_stat_statement extension versions >= 1.7,
+    # where pg_stat_statement_reset() got 3 additional arguments
+    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
+        filename=$(basename "$file"); \
+        if ! echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.10--1.11.sql) \
-            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
-            ;; \
-        esac; \
-    done;
+        fi; \
+    done

 #########################################################################################
 #
@@ -84,7 +79,6 @@ RUN cd postgres && \
 #
 #########################################################################################
 FROM build-deps AS postgis-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -93,11 +87,7 @@ RUN apt update && \
    protobuf-c-compiler xsltproc

 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
-RUN case "${PG_VERSION}" in "v17") \
-    mkdir -p /sfcgal && \
-    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
-    esac && \
-    wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -106,10 +96,7 @@ RUN case "${PG_VERSION}" in "v17") \

 ENV PATH="/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in "v17") \
-    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
-    esac && \
-    wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -135,10 +122,7 @@ RUN case "${PG_VERSION}" in "v17") \
    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
+RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
@@ -158,19 +142,12 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS plv8-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt update && \
+RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \
    # generate and copy upgrade scripts
@@ -195,13 +172,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS h3-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "$(uname -m)" in \
+RUN case "$(uname -m)" in \
      "x86_64") \
        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
        ;; \
@@ -219,11 +192,7 @@ RUN case "${PG_VERSION}" in "v17") \
      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
      && rm /tmp/cmake-install.sh

-RUN case "${PG_VERSION}" in "v17") \
-        mkdir -p /h3/usr/ && \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
@@ -233,10 +202,7 @@ RUN case "${PG_VERSION}" in "v17") \
    cp -R /h3/usr / && \
    rm -rf build

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
@@ -252,13 +218,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS unit-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -277,7 +239,6 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS vector-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch
@@ -285,10 +246,7 @@ COPY patches/pgvector.patch /pgvector.patch
 # By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
@@ -303,14 +261,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pgjwt-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,13 +277,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS hypopg-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -343,13 +293,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-hashids-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
+RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -363,15 +309,11 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS rum-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/rum.patch /rum.patch

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
+RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
    echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
    patch -p1 < /rum.patch && \
@@ -386,13 +328,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pgtap-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
+RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
    echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \
    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -406,13 +344,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS ip4r-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -426,13 +360,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS prefix-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -446,13 +376,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS hll-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -466,13 +392,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS plpgsql-check-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -491,10 +413,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
        export TIMESCALEDB_VERSION=2.10.1 \
        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
@@ -527,10 +446,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH="/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "${PG_VERSION}" in \
+RUN case "${PG_VERSION}" in \
      "v14") \
        export PG_HINT_PLAN_VERSION=14_1_4_1 \
        export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \
@@ -543,9 +459,6 @@ RUN case "${PG_VERSION}" in "v17") \
        export PG_HINT_PLAN_VERSION=16_1_6_0 \
        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
        ;; \
-      "v17") \
-        echo "TODO: PG17 pg_hint_plan support" && exit 0 \
-        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
        ;; \
@@ -565,14 +478,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-cron-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -586,13 +495,9 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS rdkit-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt-get update && \
+RUN apt-get update && \
    apt-get install -y \
        cmake \
        libboost-iostreams1.74-dev \
@@ -602,10 +507,7 @@ RUN case "${PG_VERSION}" in "v17") \
        libeigen3-dev

 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
@@ -642,14 +544,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-uuidv7-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
+RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -663,14 +561,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-roaringbitmap-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
+RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -684,14 +578,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-semver-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
+RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -730,14 +620,10 @@ RUN case "${PG_VERSION}" in \
 #
 #########################################################################################
 FROM build-deps AS pg-anon-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
-    esac && \
-    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
+RUN wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
@@ -755,7 +641,6 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt-get update && \
@@ -766,11 +651,9 @@ ENV HOME=/home/nonroot
 ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot
+ARG PG_VERSION

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
@@ -789,10 +672,7 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in "v17") \
-    echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
    echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
@@ -814,10 +694,7 @@ RUN case "${PG_VERSION}" in "v17") \
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in "v17") \
-    echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
-    esac && \
-    wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
    echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -837,10 +714,7 @@ FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION

 # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
-RUN case "${PG_VERSION}" in "v17") \
-    echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    # TODO update pgrx version in the pg_tiktoken repo and remove this line
@@ -859,10 +733,7 @@ RUN case "${PG_VERSION}" in "v17") \
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in "v17") \
-    echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
    echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx       = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -877,14 +748,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #########################################################################################

 FROM build-deps AS wal2json-pg-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \
-    esac && \
-    wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -897,14 +764,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-ivm-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \
-    esac && \
-    wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
+RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -918,14 +781,10 @@ RUN case "${PG_VERSION}" in "v17") \
 #
 #########################################################################################
 FROM build-deps AS pg-partman-build
-ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "pg_partman doesn't support PG17 yet" && exit 0;; \
-    esac && \
-    wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -995,8 +854,8 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    case "${PG_VERSION}" in \
        "v14" | "v15") \
        ;; \
-        "v16" | "v17") \
-            echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
+        "v16") \
+            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
        ;; \
        *) \
            echo "unexpected PostgreSQL version" && exit 1 \
@@ -1040,7 +899,7 @@ FROM neon-pg-ext-build AS postgres-cleanup-layer
 COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql

 # Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
-RUN cd /usr/local/pgsql/bin && rm -f ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp

 # Remove headers that we won't need anymore - we've completed installation of all extensions
 RUN rm -r /usr/local/pgsql/include
@@ -1059,10 +918,7 @@ RUN rm /usr/local/pgsql/lib/lib*.a

 FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    mkdir /ext-src
+RUN mkdir /ext-src

 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
@@ -1100,39 +956,18 @@ COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
 COPY patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/ && for f in *.tar.gz; \
+RUN cd /ext-src/ && for f in *.tar.gz; \
    do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
    || exit 1; rm -f $f; done
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/rum-src && patch -p1 <../rum.patch
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
 # cmake is required for the h3 test
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt-get update && apt-get install -y cmake
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
+RUN apt-get update && apt-get install -y cmake
+RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    patch -p1 </ext-src/pg_anon.patch
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    patch -p1 </ext-src/pg_cron.patch
+RUN patch -p1 </ext-src/pg_anon.patch
+RUN patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
 ENV PGPORT=55433
--- a/56
+++ b/56
@@ -119,8 +119,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
-.PHONY: postgres-configure-v17
-postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
 .PHONY: postgres-configure-v16
 postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
@@ -217,31 +215,29 @@ neon-pg-clean-ext-%:
 # they depend on openssl and other libraries that are not included in our
 # Rust build.
 .PHONY: walproposer-lib
-walproposer-lib: neon-pg-ext-v17
+walproposer-lib: neon-pg-ext-v16
 	+@echo "Compiling walproposer-lib"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
 		pg_strong_random.o
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
-		checksum_helper.o \
-		cryptohash_openssl.o \
+		pg_crc32c.o \
 		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
 		md5_common.o \
-		parse_manifest.o \
-		scram-common.o
-ifeq ($(UNAME_S),Linux)
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
-		pg_crc32c.o
+		checksum_helper.o
 endif

 .PHONY: walproposer-lib-clean
 walproposer-lib-clean:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
 		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean

@@ -249,44 +245,38 @@ walproposer-lib-clean:
 neon-pg-ext: \
 	neon-pg-ext-v14 \
 	neon-pg-ext-v15 \
-	neon-pg-ext-v16 \
-	neon-pg-ext-v17
+	neon-pg-ext-v16

 .PHONY: neon-pg-clean-ext
 neon-pg-clean-ext: \
 	neon-pg-clean-ext-v14 \
 	neon-pg-clean-ext-v15 \
-	neon-pg-clean-ext-v16 \
-	neon-pg-clean-ext-v17
+	neon-pg-clean-ext-v16

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
 	postgres-v15 \
-	postgres-v16 \
-	postgres-v17
+	postgres-v16

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
 	postgres-headers-v15 \
-	postgres-headers-v16 \
-	postgres-headers-v17
+	postgres-headers-v16

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
 	postgres-clean-v15 \
-	postgres-clean-v16 \
-	postgres-clean-v17
+	postgres-clean-v16

 .PHONY: postgres-check
 postgres-check: \
 	postgres-check-v14 \
 	postgres-check-v15 \
-	postgres-check-v16 \
-	postgres-check-v17
+	postgres-check-v16

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
@@ -331,13 +321,13 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 	rm -f pg*.BAK

 # Indent pxgn/neon.
-.PHONY: neon-pgindent
-neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
-		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent


--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,6 +11,7 @@ testing = []

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
@@ -23,6 +24,7 @@ num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
+serde.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
@@ -41,6 +43,7 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1052,19 +1052,26 @@ impl ComputeNode {
        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
-            let pgdata_path = Path::new(&self.pgdata);
-            // temporarily reset max_cluster_size in config
-            // to avoid the possibility of hitting the limit, while we are applying config:
-            // creating new extensions, roles, etc...
-            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
+        if pspec.spec.mode == ComputeMode::Primary {
+            if !pspec.spec.skip_pg_catalog_updates {
+                let pgdata_path = Path::new(&self.pgdata);
+                // temporarily reset max_cluster_size in config
+                // to avoid the possibility of hitting the limit, while we are applying config:
+                // creating new extensions, roles, etc...
+                config::with_compute_ctl_tmp_override(
+                    pgdata_path,
+                    "neon.max_cluster_size=-1",
+                    || {
+                        self.pg_reload_conf()?;
+
+                        self.apply_config(&compute_state)?;
+
+                        Ok(())
+                    },
+                )?;
                self.pg_reload_conf()?;
-
-                self.apply_config(&compute_state)?;
-
-                Ok(())
-            })?;
-            self.pg_reload_conf()?;
+            }
+            self.post_apply_config()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -124,7 +124,6 @@ fn parse_pg_version(human_version: &str) -> &str {
            "14" => return "v14",
            "15" => return "v15",
            "16" => return "v16",
-            "17" => return "v17",
            _ => {}
        },
        _ => {}
--- a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -1 +0,0 @@
-GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -793,9 +793,6 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!(
            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
        ),
-        include_str!(
-            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
-        ),
    ];

    MigrationRunner::new(client, &migrations).run_migrations()?;
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,10 +9,13 @@ anyhow.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
+postgres.workspace = true
+hex.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
@@ -20,6 +23,8 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+serde_with.workspace = true
+tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -151,7 +151,7 @@ where
                    print!(".");
                    io::stdout().flush().unwrap();
                }
-                tokio::time::sleep(RETRY_INTERVAL).await;
+                thread::sleep(RETRY_INTERVAL);
            }
            Err(e) => {
                println!("error starting process {process_name:?}: {e:#}");
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -34,14 +34,12 @@ use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
-use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use std::time::Duration;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
-use tokio::task::JoinSet;
 use url::Host;
 use utils::{
    auth::{Claims, Scope},
@@ -89,35 +87,34 @@ fn main() -> Result<()> {

    // Check for 'neon init' command first.
    let subcommand_result = if sub_name == "init" {
-        handle_init(sub_args).map(|env| Some(Cow::Owned(env)))
+        handle_init(sub_args).map(Some)
    } else {
        // all other commands need an existing config
-
-        let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
+        let mut env =
+            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
        let original_env = env.clone();
-        let env = Box::leak(Box::new(env));
+
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .unwrap();

        let subcommand_result = match sub_name {
-            "tenant" => rt.block_on(handle_tenant(sub_args, env)),
-            "timeline" => rt.block_on(handle_timeline(sub_args, env)),
-            "start" => rt.block_on(handle_start_all(env, get_start_timeout(sub_args))),
-            "stop" => rt.block_on(handle_stop_all(sub_args, env)),
-            "pageserver" => rt.block_on(handle_pageserver(sub_args, env)),
-            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, env)),
-            "storage_broker" => rt.block_on(handle_storage_broker(sub_args, env)),
-            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, env)),
-            "endpoint" => rt.block_on(handle_endpoint(sub_args, env)),
-            "mappings" => handle_mappings(sub_args, env),
+            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
+            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
+            "start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
+            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
+            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
+            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
+            "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };

-        if &original_env != env {
-            subcommand_result.map(|()| Some(Cow::Borrowed(env)))
+        if original_env != env {
+            subcommand_result.map(|()| Some(env))
        } else {
            subcommand_result.map(|()| None)
        }
@@ -643,8 +640,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
        }
        Some(("branch", branch_match)) => {
            let tenant_id = get_tenant_id(branch_match, env)?;
-            let new_timeline_id =
-                parse_timeline_id(branch_match)?.unwrap_or(TimelineId::generate());
            let new_branch_name = branch_match
                .get_one::<String>("branch-name")
                .ok_or_else(|| anyhow!("No branch name provided"))?;
@@ -663,6 +658,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .map(|lsn_str| Lsn::from_str(lsn_str))
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
+            let new_timeline_id = TimelineId::generate();
            let storage_controller = StorageController::from_env(env);
            let create_req = TimelineCreateRequest {
                new_timeline_id,
@@ -1248,122 +1244,49 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_storage_broker(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match sub_match.subcommand() {
-        Some(broker_command_data) => broker_command_data,
-        None => bail!("no broker subcommand provided"),
-    };
-
-    match sub_name {
-        "start" => {
-            if let Err(e) = broker::start_broker_process(env, get_start_timeout(sub_args)).await {
-                eprintln!("broker start failed: {e}");
-                exit(1);
-            }
-        }
-
-        "stop" => {
-            if let Err(e) = broker::stop_broker_process(env) {
-                eprintln!("broker stop failed: {e}");
-                exit(1);
-            }
-        }
-
-        _ => bail!("Unexpected broker subcommand '{}'", sub_name),
-    }
-    Ok(())
-}
-
 async fn handle_start_all(
-    env: &'static local_env::LocalEnv,
+    env: &local_env::LocalEnv,
    retry_timeout: &Duration,
 ) -> anyhow::Result<()> {
-    let Err(errors) = handle_start_all_impl(env, *retry_timeout).await else {
-        neon_start_status_check(env, retry_timeout)
-            .await
-            .context("status check after successful startup of all services")?;
-        return Ok(());
-    };
-
-    eprintln!("startup failed because one or more services could not be started");
-
-    for e in errors {
-        eprintln!("{e}");
-        let debug_repr = format!("{e:?}");
-        for line in debug_repr.lines() {
-            eprintln!("  {line}");
-        }
-    }
-
-    try_stop_all(env, true).await;
-
-    exit(2);
-}
-
-/// Returns Ok() if and only if all services could be started successfully.
-/// Otherwise, returns the list of errors that occurred during startup.
-async fn handle_start_all_impl(
-    env: &'static local_env::LocalEnv,
-    retry_timeout: Duration,
-) -> Result<(), Vec<anyhow::Error>> {
    // Endpoints are not started automatically

-    let mut js = JoinSet::new();
+    broker::start_broker_process(env, retry_timeout).await?;

-    // force infalliblity through closure
-    #[allow(clippy::redundant_closure_call)]
-    (|| {
-        js.spawn(async move {
-            let retry_timeout = retry_timeout;
-            broker::start_broker_process(env, &retry_timeout).await
-        });
-
-        // Only start the storage controller if the pageserver is configured to need it
-        if env.control_plane_api.is_some() {
-            js.spawn(async move {
-                let storage_controller = StorageController::from_env(env);
-                storage_controller
-                    .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                        retry_timeout.into(),
-                    ))
-                    .await
-                    .map_err(|e| e.context("start storage_controller"))
-            });
-        }
-
-        for ps_conf in &env.pageservers {
-            js.spawn(async move {
-                let pageserver = PageServerNode::from_env(env, ps_conf);
-                pageserver
-                    .start(&retry_timeout)
-                    .await
-                    .map_err(|e| e.context(format!("start pageserver {}", ps_conf.id)))
-            });
-        }
-
-        for node in env.safekeepers.iter() {
-            js.spawn(async move {
-                let safekeeper = SafekeeperNode::from_env(env, node);
-                safekeeper
-                    .start(vec![], &retry_timeout)
-                    .await
-                    .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
-            });
-        }
-    })();
-
-    let mut errors = Vec::new();
-    while let Some(result) = js.join_next().await {
-        let result = result.expect("we don't panic or cancel the tasks");
-        if let Err(e) = result {
-            errors.push(e);
+    // Only start the storage controller if the pageserver is configured to need it
+    if env.control_plane_api.is_some() {
+        let storage_controller = StorageController::from_env(env);
+        if let Err(e) = storage_controller
+            .start(NeonStorageControllerStartArgs::with_default_instance_id(
+                (*retry_timeout).into(),
+            ))
+            .await
+        {
+            eprintln!("storage_controller start failed: {:#}", e);
+            try_stop_all(env, true).await;
+            exit(1);
        }
    }

-    if !errors.is_empty() {
-        return Err(errors);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.start(retry_timeout).await {
+            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
+            try_stop_all(env, true).await;
+            exit(1);
+        }
    }

+    for node in env.safekeepers.iter() {
+        let safekeeper = SafekeeperNode::from_env(env, node);
+        if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
+            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
+            try_stop_all(env, false).await;
+            exit(1);
+        }
+    }
+
+    neon_start_status_check(env, retry_timeout).await?;
+
    Ok(())
 }

@@ -1647,6 +1570,7 @@ fn cli() -> Command {
                        .value_parser(value_parser!(PathBuf))
                        .value_name("config")
                )
+                .arg(pg_version_arg.clone())
                .arg(force_arg)
        )
        .subcommand(
@@ -1659,7 +1583,6 @@ fn cli() -> Command {
            .subcommand(Command::new("branch")
                .about("Create a new timeline, using another timeline as a base, copying its data")
                .arg(tenant_id_arg.clone())
-                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name")
                    .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false))
@@ -1748,19 +1671,6 @@ fn cli() -> Command {
                            .arg(stop_mode_arg.clone())
                            .arg(instance_id))
        )
-        .subcommand(
-            Command::new("storage_broker")
-                .arg_required_else_help(true)
-                .about("Manage broker")
-                .subcommand(Command::new("start")
-                            .about("Start broker")
-                            .arg(timeout_arg.clone())
-                )
-                .subcommand(Command::new("stop")
-                            .about("Stop broker")
-                            .arg(stop_mode_arg.clone())
-                )
-        )
        .subcommand(
            Command::new("safekeeper")
                .arg_required_else_help(true)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -702,7 +702,7 @@ impl Endpoint {
                    }
                }
            }
-            tokio::time::sleep(ATTEMPT_INTERVAL).await;
+            std::thread::sleep(ATTEMPT_INTERVAL);
        }

        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -342,7 +342,7 @@ impl LocalEnv {

        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -75,14 +75,14 @@ impl PageServerNode {
        }
    }

-    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
-        toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
+    fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
+        toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
    }

    fn pageserver_init_make_toml(
        &self,
        conf: NeonLocalInitPageserverConf,
-    ) -> anyhow::Result<toml_edit::DocumentMut> {
+    ) -> anyhow::Result<toml_edit::Document> {
        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");

        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
@@ -137,9 +137,9 @@ impl PageServerNode {

        // Turn `overrides` into a toml document.
        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
-        let mut config_toml = toml_edit::DocumentMut::new();
+        let mut config_toml = toml_edit::Document::new();
        for fragment_str in overrides {
-            let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
+            let fragment = toml_edit::Document::from_str(&fragment_str)
                .expect("all fragments in `overrides` are valid toml documents, this function controls that");
            for (key, item) in fragment.iter() {
                config_toml.insert(key, item.clone());
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -28,7 +28,6 @@ use utils::{
    auth::{encode_from_key_file, Claims, Scope},
    id::{NodeId, TenantId},
 };
-use whoami::username;

 pub struct StorageController {
    env: LocalEnv,
@@ -184,7 +183,7 @@ impl StorageController {
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];

        for v in prefer_versions {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
@@ -212,16 +211,7 @@ impl StorageController {
    /// Readiness check for our postgres process
    async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
        let bin_path = pg_bin_dir.join("pg_isready");
-        let args = [
-            "-h",
-            "localhost",
-            "-U",
-            &username(),
-            "-d",
-            DB_NAME,
-            "-p",
-            &format!("{}", postgres_port),
-        ];
+        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;

        Ok(exitcode.success())
@@ -235,11 +225,7 @@ impl StorageController {
    ///
    /// Returns the database url
    pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
-        let database_url = format!(
-            "postgresql://{}@localhost:{}/{DB_NAME}",
-            &username(),
-            postgres_port
-        );
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let createdb_path = pg_bin_dir.join("createdb");
@@ -249,10 +235,6 @@ impl StorageController {
                "localhost",
                "-p",
                &format!("{}", postgres_port),
-                "-U",
-                &username(),
-                "-O",
-                &username(),
                DB_NAME,
            ])
            .output()
@@ -289,7 +271,7 @@ impl StorageController {
            // But tokio-postgres fork doesn't have this upstream commit:
            // https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
            // => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
-            .user(&username())
+            .user(&whoami::username())
            .dbname(DB_NAME)
            .connect(tokio_postgres::NoTls)
            .await
@@ -346,12 +328,6 @@ impl StorageController {
            let pg_log_path = pg_data_path.join("postgres.log");

            if !tokio::fs::try_exists(&pg_data_path).await? {
-                let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()];
-                tracing::info!(
-                    "Initializing storage controller database with args: {:?}",
-                    initdb_args
-                );
-
                // Initialize empty database
                let initdb_path = pg_bin_dir.join("initdb");
                let mut child = Command::new(&initdb_path)
@@ -359,7 +335,7 @@ impl StorageController {
                        ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                        ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                    ])
-                    .args(initdb_args)
+                    .args(["-D", pg_data_path.as_ref()])
                    .spawn()
                    .expect("Failed to spawn initdb");
                let status = child.wait().await?;
@@ -388,14 +364,8 @@ impl StorageController {
                pg_data_path.as_ref(),
                "-l",
                pg_log_path.as_ref(),
-                "-U",
-                &username(),
                "start",
            ];
-            tracing::info!(
-                "Starting storage controller database with args: {:?}",
-                db_start_args
-            );

            background_process::start_process(
                "storage_controller_db",
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -11,11 +11,14 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
+hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
+thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
+serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true

--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,6 +5,9 @@ edition = "2021"
 license = "Apache-2.0"

 [dependencies]
+anyhow.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 rand.workspace = true
 serde.workspace = true
+serde_with.workspace = true
+utils.workspace = true
--- a/libs/desim/Cargo.toml
+++ b/libs/desim/Cargo.toml
@@ -12,4 +12,5 @@ bytes.workspace = true
 utils.workspace = true
 parking_lot.workspace = true
 hex.workspace = true
+scopeguard.workspace = true
 smallvec = { workspace = true, features = ["write"] }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,8 +1,8 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::Oid;
 use postgres_ffi::RepOriginId;
+use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};

@@ -263,6 +263,15 @@ impl Key {
        field5: u8::MAX,
        field6: u32::MAX,
    };
+    /// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
+    pub const NON_L0_MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX - 1,
+    };

    pub fn from_hex(s: &str) -> Result<Self> {
        if s.len() != 36 {
@@ -350,17 +359,7 @@ impl Key {
 // 02 00000000 00000000 00000000 00   00000000
 //
 // TwoPhaseFile:
-//
-// 02 00000000 00000000 00XXXXXX XX   XXXXXXXX
-//
-//                        \______XID_________/
-//
-// The 64-bit XID is stored a little awkwardly in field6, field5 and
-// field4. PostgreSQL v16 and below only stored a 32-bit XID, which
-// fit completely in field6, but starting with PostgreSQL v17, a full
-// 64-bit XID is used. Most pageserver code that accesses
-// TwoPhaseFiles now deals with 64-bit XIDs even on v16, the high bits
-// are just unused.
+// 02 00000000 00000000 00000000 00   XID
 //
 // ControlFile:
 // 03 00000000 00000000 00000000 00   00000000
@@ -592,36 +591,35 @@ pub const TWOPHASEDIR_KEY: Key = Key {
 };

 #[inline(always)]
-pub fn twophase_file_key(xid: u64) -> Key {
+pub fn twophase_file_key(xid: TransactionId) -> Key {
    Key {
        field1: 0x02,
        field2: 0,
        field3: 0,
-        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
-        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
-        field6: (xid & 0x00000000FFFFFFFF) as u32,
+        field4: 0,
+        field5: 0,
+        field6: xid,
    }
 }

 #[inline(always)]
-pub fn twophase_key_range(xid: u64) -> Range<Key> {
-    // 64-bit XIDs really should not overflow
+pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
    let (next_xid, overflowed) = xid.overflowing_add(1);

    Key {
        field1: 0x02,
        field2: 0,
        field3: 0,
-        field4: ((xid & 0xFFFFFF0000000000) >> 40) as u32,
-        field5: ((xid & 0x000000FF00000000) >> 32) as u8,
-        field6: (xid & 0x00000000FFFFFFFF) as u32,
+        field4: 0,
+        field5: 0,
+        field6: xid,
    }..Key {
        field1: 0x02,
        field2: 0,
-        field3: u32::from(overflowed),
-        field4: ((next_xid & 0xFFFFFF0000000000) >> 40) as u32,
-        field5: ((next_xid & 0x000000FF00000000) >> 32) as u8,
-        field6: (next_xid & 0x00000000FFFFFFFF) as u32,
+        field3: 0,
+        field4: 0,
+        field5: u8::from(overflowed),
+        field6: next_xid,
    }
 }

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -62,7 +62,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::VariantNames,
+    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -89,19 +89,8 @@ impl PageserverUtilization {

    /// If a node is currently hosting more work than it can comfortably handle.  This does not indicate that
    /// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
-    ///
-    /// When a node is overloaded, we may override soft affinity preferences and do things like scheduling
-    /// into a node in a less desirable AZ, if all the nodes in the preferred AZ are overloaded.
    pub fn is_overloaded(score: RawScore) -> bool {
-        // Why the factor of two?  This is unscientific but reflects behavior of real systems:
-        // - In terms of shard counts, a node's preferred max count is a soft limit intended to keep
-        //   startup and housekeeping jobs nice and responsive.  We can go to double this limit if needed
-        //   until some more nodes are deployed.
-        // - In terms of disk space, the node's utilization heuristic assumes every tenant needs to
-        //   hold its biggest timeline fully on disk, which is tends to be an over estimate when
-        //   some tenants are very idle and have dropped layers from disk.  In practice going up to
-        //   double is generally better than giving up and scheduling in a sub-optimal AZ.
-        score >= 2 * Self::UTILIZATION_FULL
+        score >= Self::UTILIZATION_FULL
    }

    pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -5,8 +5,10 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
+futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -81,16 +81,17 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
    )
 }

+#[async_trait::async_trait]
 pub trait Handler<IO> {
    /// Handle single query.
    /// postgres_backend will issue ReadyForQuery after calling this (this
    /// might be not what we want after CopyData streaming, but currently we don't
    /// care). It will also flush out the output buffer.
-    fn process_query(
+    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
-    ) -> impl Future<Output = Result<(), QueryError>>;
+    ) -> Result<(), QueryError>;

    /// Called on startup packet receival, allows to process params.
    ///
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -23,6 +23,7 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {

 struct TestHandler {}

+#[async_trait::async_trait]
 impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
    // return single col 'hey' for any query
    async fn process_query(
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -5,10 +5,13 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+rand.workspace = true
 regex.workspace = true
 bytes.workspace = true
+byteorder.workspace = true
 anyhow.workspace = true
 crc32c.workspace = true
+hex.workspace = true
 once_cell.workspace = true
 log.workspace = true
 memoffset.workspace = true
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15", "v16", "v17"] {
+    for pg_version in &["v14", "v15", "v16"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -121,7 +121,6 @@ fn main() -> anyhow::Result<()> {
            .allowlist_type("XLogPageHeaderData")
            .allowlist_type("XLogLongPageHeaderData")
            .allowlist_var("XLOG_PAGE_MAGIC")
-            .allowlist_var("PG_MAJORVERSION_NUM")
            .allowlist_var("PG_CONTROL_FILE_SIZE")
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -44,9 +44,6 @@ macro_rules! postgres_ffi {
            // Re-export some symbols from bindings
            pub use bindings::DBState_DB_SHUTDOWNED;
            pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
-
-            pub const ZERO_CHECKPOINT: bytes::Bytes =
-                bytes::Bytes::from_static(&[0u8; xlog_utils::SIZEOF_CHECKPOINT]);
        }
    };
 }
@@ -57,7 +54,6 @@ macro_rules! for_all_postgres_versions {
        $macro!(v14);
        $macro!(v15);
        $macro!(v16);
-        $macro!(v17);
    };
 }

@@ -92,7 +88,6 @@ macro_rules! dispatch_pgversion {
                14 : v14,
                15 : v15,
                16 : v16,
-                17 : v17,
            ]
        )
    };
@@ -111,110 +106,6 @@ macro_rules! dispatch_pgversion {
    };
 }

-#[macro_export]
-macro_rules! enum_pgversion_dispatch {
-    ($name:expr, $typ:ident, $bind:ident, $code:block) => {
-        enum_pgversion_dispatch!(
-            name = $name,
-            bind = $bind,
-            typ = $typ,
-            code = $code,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-                V17 : v17,
-            ]
-        )
-    };
-    (name = $name:expr,
-     bind = $bind:ident,
-     typ = $typ:ident,
-     code = $code:block,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]) => {
-        match $name {
-            $(
-            self::$typ::$variant($bind) => {
-                use $crate::$md as pgv;
-                $code
-            }
-            ),+,
-        }
-    };
-}
-
-#[macro_export]
-macro_rules! enum_pgversion {
-    {$name:ident, pgv :: $t:ident} => {
-        enum_pgversion!{
-            name = $name,
-            typ = $t,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-                V17 : v17,
-            ]
-        }
-    };
-    {$name:ident, pgv :: $p:ident :: $t:ident} => {
-        enum_pgversion!{
-            name = $name,
-            path = $p,
-            typ = $t,
-            pgversions = [
-                V14 : v14,
-                V15 : v15,
-                V16 : v16,
-                V17 : v17,
-            ]
-        }
-    };
-    {name = $name:ident,
-     typ = $t:ident,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
-        pub enum $name {
-            $($variant ( $crate::$md::$t )),+
-        }
-        impl self::$name {
-            pub fn pg_version(&self) -> u32 {
-                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
-                })
-            }
-        }
-        $(
-        impl Into<self::$name> for $crate::$md::$t {
-            fn into(self) -> self::$name {
-                self::$name::$variant (self)
-            }
-        }
-        )+
-    };
-    {name = $name:ident,
-     path = $p:ident,
-     typ = $t:ident,
-     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
-        pub enum $name {
-            $($variant ($crate::$md::$p::$t)),+
-        }
-        impl $name {
-            pub fn pg_version(&self) -> u32 {
-                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::PG_MAJORVERSION_NUM
-                })
-            }
-        }
-        $(
-        impl Into<$name> for $crate::$md::$p::$t {
-            fn into(self) -> $name {
-                $name::$variant (self)
-            }
-        }
-        )+
-    };
-}
-
 pub mod pg_constants;
 pub mod relfile_utils;

--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -152,9 +152,6 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

-// From heapam_xlog.h
-pub const XLOG_HEAP2_REWRITE: u8 = 0x00;
-
 // From replication/message.h
 pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;

@@ -222,20 +219,15 @@ pub const INVALID_TRANSACTION_ID: u32 = 0;
 pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
 pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

-/* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
-pub const XLOG_PARAMETER_CHANGE: u8 = 0x60;
-pub const XLOG_END_OF_RECOVERY: u8 = 0x90;
+pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
+pub const XLP_LONG_HEADER: u16 = 0x0002;

 /* From xlog.h */
 pub const XLOG_REPLORIGIN_SET: u8 = 0x00;
 pub const XLOG_REPLORIGIN_DROP: u8 = 0x10;

-/* xlog_internal.h */
-pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
-pub const XLP_LONG_HEADER: u16 = 0x0002;
-
 /* From replication/slot.h */
 pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
   + 64 /* NameData */  + 4*4;
@@ -253,6 +245,33 @@ pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
 /* From origin.c */
 pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE;

+// List of subdirectories inside pgdata.
+// Copied from src/bin/initdb/initdb.c
+pub const PGDATA_SUBDIRS: [&str; 22] = [
+    "global",
+    "pg_wal/archive_status",
+    "pg_commit_ts",
+    "pg_dynshmem",
+    "pg_notify",
+    "pg_serial",
+    "pg_snapshots",
+    "pg_subtrans",
+    "pg_twophase",
+    "pg_multixact",
+    "pg_multixact/members",
+    "pg_multixact/offsets",
+    "base",
+    "base/1",
+    "pg_replslot",
+    "pg_tblspc",
+    "pg_stat",
+    "pg_stat_tmp",
+    "pg_xact",
+    "pg_logical",
+    "pg_logical/snapshots",
+    "pg_logical/mappings",
+];
+
 // Don't include postgresql.conf as it is inconvenient on node start:
 // we need postgresql.conf before basebackup to synchronize safekeepers
 // so no point in overwriting it during backup restore. Rest of the files
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -5,33 +5,6 @@ pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */

-// List of subdirectories inside pgdata.
-// Copied from src/bin/initdb/initdb.c
-pub const PGDATA_SUBDIRS: [&str; 22] = [
-    "global",
-    "pg_wal/archive_status",
-    "pg_commit_ts",
-    "pg_dynshmem",
-    "pg_notify",
-    "pg_serial",
-    "pg_snapshots",
-    "pg_subtrans",
-    "pg_twophase",
-    "pg_multixact",
-    "pg_multixact/members",
-    "pg_multixact/offsets",
-    "base",
-    "base/1",
-    "pg_replslot",
-    "pg_tblspc",
-    "pg_stat",
-    "pg_stat_tmp",
-    "pg_xact",
-    "pg_logical",
-    "pg_logical/snapshots",
-    "pg_logical/mappings",
-];
-
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
 }
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -11,8 +11,6 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */

 pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */

-pub use super::super::v14::bindings::PGDATA_SUBDIRS;
-
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -11,8 +11,6 @@ pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */

 pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */

-pub use super::super::v14::bindings::PGDATA_SUBDIRS;
-
 pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;

--- a/libs/postgres_ffi/src/pg_constants_v17.rs
+++ b/libs/postgres_ffi/src/pg_constants_v17.rs
@@ -1,55 +0,0 @@
-pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
-
-pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
-pub const XLOG_DBASE_DROP: u8 = 0x20;
-
-pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
-pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
-pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
-pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
-
-pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
-
-// List of subdirectories inside pgdata.
-// Copied from src/bin/initdb/initdb.c
-pub const PGDATA_SUBDIRS: [&str; 23] = [
-    "global",
-    "pg_wal/archive_status",
-    "pg_wal/summaries",
-    "pg_commit_ts",
-    "pg_dynshmem",
-    "pg_notify",
-    "pg_serial",
-    "pg_snapshots",
-    "pg_subtrans",
-    "pg_twophase",
-    "pg_multixact",
-    "pg_multixact/members",
-    "pg_multixact/offsets",
-    "base",
-    "base/1",
-    "pg_replslot",
-    "pg_tblspc",
-    "pg_stat",
-    "pg_stat_tmp",
-    "pg_xact",
-    "pg_logical",
-    "pg_logical/snapshots",
-    "pg_logical/mappings",
-];
-
-pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
-    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
-
-    (bimg_info & ANY_COMPRESS_FLAG) != 0
-}
-
-
-pub const XLOG_HEAP2_PRUNE_ON_ACCESS: u8 = 0x10;
-pub const XLOG_HEAP2_PRUNE_VACUUM_SCAN: u8 = 0x20;
-pub const XLOG_HEAP2_PRUNE_VACUUM_CLEANUP: u8 = 0x30;
-
-
-pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
-pub const XLOG_CHECKPOINT_REDO: u8 = 0xE0;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -30,7 +30,7 @@ use std::fs::File;
 use std::io::prelude::*;
 use std::io::ErrorKind;
 use std::io::SeekFrom;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
@@ -260,6 +260,13 @@ fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
    }
 }

+pub fn main() {
+    let mut data_dir = PathBuf::new();
+    data_dir.push(".");
+    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
+    println!("wal_end={:?}", wal_end);
+}
+
 impl XLogRecord {
    pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -9,6 +9,7 @@ anyhow.workspace = true
 clap.workspace = true
 env_logger.workspace = true
 log.workspace = true
+once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
 camino-tempfile.workspace = true
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -53,7 +53,7 @@ impl Conf {

        #[allow(clippy::manual_range_patterns)]
        match self.pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -8,8 +8,10 @@ license.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 itertools.workspace = true
+pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio = { workspace = true, features = ["io-util"] }
+tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,11 +13,14 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
+aws-credential-types.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
+humantime.workspace = true
 humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
+rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -185,7 +185,7 @@ mod tests {
    use super::*;

    fn parse(input: &str) -> anyhow::Result<RemoteStorageConfig> {
-        let toml = input.parse::<toml_edit::DocumentMut>().unwrap();
+        let toml = input.parse::<toml_edit::Document>().unwrap();
        RemoteStorageConfig::from_toml(toml.as_item())
    }

--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,5 +6,6 @@ license.workspace = true

 [dependencies]
 serde.workspace = true
+serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -9,9 +9,8 @@ hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
-
-[dev-dependencies]
-tracing-subscriber.workspace = true    # For examples in docs
+tracing-subscriber.workspace = true
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -42,6 +42,7 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
+serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
--- a/libs/utils/src/accum.rs
+++ b/libs/utils/src/accum.rs
@@ -0,0 +1,33 @@
+/// A helper to "accumulate" a value similar to `Iterator::reduce`, but lets you
+/// feed the accumulated values by calling the 'accum' function, instead of having an
+/// iterator.
+///
+/// For example, to calculate the smallest value among some integers:
+///
+/// ```
+/// use utils::accum::Accum;
+///
+/// let values = [1, 2, 3];
+///
+/// let mut min_value: Accum<u32> = Accum(None);
+/// for new_value in &values {
+///     min_value.accum(std::cmp::min, *new_value);
+/// }
+///
+/// assert_eq!(min_value.0.unwrap(), 1);
+/// ```
+pub struct Accum<T>(pub Option<T>);
+impl<T: Copy> Accum<T> {
+    pub fn accum<F>(&mut self, func: F, new_value: T)
+    where
+        F: FnOnce(T, T) -> T,
+    {
+        // If there is no previous value, just store the new value.
+        // Otherwise call the function to decide which one to keep.
+        self.0 = Some(if let Some(accum) = self.0 {
+            func(accum, new_value)
+        } else {
+            new_value
+        });
+    }
+}
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -88,6 +88,12 @@ impl<'de> Deserialize<'de> for Id {
 }

 impl Id {
+    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
+        let mut arr = [0u8; 16];
+        buf.copy_to_slice(&mut arr);
+        Id::from(arr)
+    }
+
    pub fn from_slice(src: &[u8]) -> Result<Id, IdError> {
        if src.len() != 16 {
            return Err(IdError::SliceParseError(src.len()));
@@ -173,6 +179,10 @@ impl fmt::Debug for Id {
 macro_rules! id_newtype {
    ($t:ident) => {
        impl $t {
+            pub fn get_from_buf(buf: &mut impl bytes::Buf) -> $t {
+                $t(Id::get_from_buf(buf))
+            }
+
            pub fn from_slice(src: &[u8]) -> Result<$t, IdError> {
                Ok($t(Id::from_slice(src)?))
            }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -43,9 +43,16 @@ pub mod logging;
 pub mod lock_file;
 pub mod pid_file;

+// Misc
+pub mod accum;
+pub mod shutdown;
+
 // Utility for binding TcpListeners with proper socket options.
 pub mod tcp_listener;

+// Utility for putting a raw file descriptor into non-blocking mode
+pub mod nonblock;
+
 // Default signal handling
 pub mod sentry_init;
 pub mod signals;
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -3,9 +3,11 @@ use std::str::FromStr;
 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use strum_macros::{EnumString, VariantNames};
+use strum_macros::{EnumString, EnumVariantNames};

-#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[derive(
+    EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy,
+)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
    Plain,
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,5 +1,6 @@
 #![warn(missing_docs)]

+use camino::Utf8Path;
 use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
@@ -144,6 +145,14 @@ impl Lsn {
        i128::from(self.0) - i128::from(other)
    }

+    /// Parse an LSN from a filename in the form `0000000000000000`
+    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
+    where
+        F: AsRef<Utf8Path>,
+    {
+        Lsn::from_hex(filename.as_ref().as_str())
+    }
+
    /// Parse an LSN from a string in the form `0000000000000000`
    pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
    where
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -0,0 +1,17 @@
+use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
+use std::os::unix::io::RawFd;
+
+/// Put a file descriptor into non-blocking mode
+pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
+    let bits = fcntl(fd, F_GETFL)?;
+
+    // If F_GETFL returns some unknown bits, they should be valid
+    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
+    // would effectively clear them, which is not what we want.
+    let mut flags = OFlag::from_bits_retain(bits);
+    flags |= OFlag::O_NONBLOCK;
+
+    fcntl(fd, F_SETFL(flags))?;
+
+    Ok(())
+}
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -0,0 +1,7 @@
+/// Immediately terminate the calling process without calling
+/// atexit callbacks, C runtime destructors etc. We mainly use
+/// this to protect coverage data from concurrent writes.
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
+    unsafe { nix::libc::_exit(code as _) };
+}
--- a/libs/utils/src/toml_edit_ext.rs
+++ b/libs/utils/src/toml_edit_ext.rs
@@ -10,7 +10,7 @@ pub fn deserialize_item<T>(item: &toml_edit::Item) -> Result<T, Error>
 where
    T: serde::de::DeserializeOwned,
 {
-    let document: toml_edit::DocumentMut = match item {
+    let document: toml_edit::Document = match item {
        toml_edit::Item::Table(toml) => toml.clone().into(),
        toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
            toml.clone().into_table().into()
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -15,11 +15,13 @@ anyhow.workspace = true
 axum.workspace = true
 clap.workspace = true
 futures.workspace = true
+inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
 tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
+tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -5,8 +5,6 @@ use std::{env, path::PathBuf, process::Command};

 use anyhow::{anyhow, Context};

-const WALPROPOSER_PG_VERSION: &str = "v17";
-
 fn main() -> anyhow::Result<()> {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
    println!("cargo:rerun-if-changed=bindgen_deps.h");
@@ -38,10 +36,7 @@ fn main() -> anyhow::Result<()> {
    // Rebuild crate when libwalproposer.a changes
    println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");

-    let pg_config_bin = pg_install_abs
-        .join(WALPROPOSER_PG_VERSION)
-        .join("bin")
-        .join("pg_config");
+    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
    let inc_server_path: String = if pg_config_bin.exists() {
        let output = Command::new(pg_config_bin)
            .arg("--includedir-server")
@@ -58,7 +53,7 @@ fn main() -> anyhow::Result<()> {
            .into()
    } else {
        let server_path = pg_install_abs
-            .join(WALPROPOSER_PG_VERSION)
+            .join("v16")
            .join("include")
            .join("postgresql")
            .join("server")
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -15,6 +15,7 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
+async-trait.workspace = true
 bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
@@ -22,9 +23,12 @@ camino.workspace = true
 camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
 consumption_metrics.workspace = true
 crc32c.workspace = true
+crossbeam-utils.workspace = true
 either.workspace = true
+flate2.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
@@ -53,6 +57,10 @@ serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
+signal-hook.workspace = true
+smallvec = { workspace = true, features = ["write"] }
+svg_fmt.workspace = true
+sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
@@ -65,6 +73,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -142,11 +142,16 @@ impl PagestreamClient {
    ) -> anyhow::Result<PagestreamGetPageResponse> {
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
-        // let mut req = tokio_util::io::ReaderStream::new(&req);
-        let mut req = tokio_stream::once(Ok(req));

-        self.copy_both.send_all(&mut req).await?;
+        for i in 0..10 {
+            let mut req = tokio_stream::once(Ok(req.clone()));
+            self.copy_both.send_all(&mut req).await?;
+        }

+        for i in 0..9 {
+            let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
+            let next: bytes::Bytes = next.unwrap()?;
+        }
        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
        let next: bytes::Bytes = next.unwrap()?;

--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -9,19 +9,41 @@ default = []

 [dependencies]
 anyhow.workspace = true
+async-compression.workspace = true
 async-stream.workspace = true
+byteorder.workspace = true
+bytes.workspace = true
+chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
+const_format.workspace = true
+consumption_metrics.workspace = true
+crossbeam-utils.workspace = true
+either.workspace = true
+flate2.workspace = true
+fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
+hex.workspace = true
+humantime.workspace = true
+humantime-serde.workspace = true
 itertools.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pin-project-lite.workspace = true
 rand.workspace = true
+smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
+thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
+tokio-util.workspace = true
 tracing.workspace = true
+tracing-error.workspace = true
 tracing-subscriber.workspace = true
+url.workspace = true
+walkdir.workspace = true
+metrics.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true

--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
@@ -23,4 +24,5 @@ toml_edit.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
 serde_json.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -79,24 +79,16 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
        return None;
    }
    let keys: Vec<&str> = split[0].split('-').collect();
-    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
-    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
-    let the_lsns: [&str; 2];
-
-    /*
-     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
-     * parse 'vX' as an LSN.
-     */
-    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
-        the_lsns = [lsns[0], lsns[0]];
+    let mut lsns: Vec<&str> = split[1].split('-').collect();
+    let is_delta = if lsns.len() == 1 {
+        lsns.push(lsns[0]);
        false
    } else {
-        the_lsns = [lsns[0], lsns[1]];
        true
    };

    let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
-    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
+    let lsn_range = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap();
    let holes = Vec::new();
    Some(LayerFile {
        key_range,
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -174,7 +174,7 @@ async fn main() -> anyhow::Result<()> {
                println!("specified prefix '{}' failed validation", cmd.prefix);
                return Ok(());
            };
-            let toml_document = toml_edit::DocumentMut::from_str(&cmd.config_toml_str)?;
+            let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?;
            let toml_item = toml_document
                .get("remote_storage")
                .expect("need remote_storage");
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -30,8 +30,9 @@ use pageserver_api::reltag::{RelTag, SlruKind};

 use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
-use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA};
+use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
+use postgres_ffi::TransactionId;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
@@ -254,11 +255,8 @@ where

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;

-        let pgversion = self.timeline.pg_version;
-        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
-
        // Create pgdata subdirs structure
-        for dir in subdirs.iter() {
+        for dir in PGDATA_SUBDIRS.iter() {
            let header = new_tar_header_dir(dir)?;
            self.ar
                .append(&header, &mut io::empty())
@@ -608,7 +606,7 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: u64) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
@@ -619,11 +617,7 @@ where
        buf.extend_from_slice(&img[..]);
        let crc = crc32c::crc32c(&img[..]);
        buf.put_u32_le(crc);
-        let path = if self.timeline.pg_version < 17 {
-            format!("pg_twophase/{:>08X}", xid)
-        } else {
-            format!("pg_twophase/{:>016X}", xid)
-        };
+        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
        self.ar
            .append(&header, &buf[..])
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -281,7 +281,7 @@ impl PageServerConf {

        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -178,7 +178,7 @@ async fn collect_metrics(
                )
                .await;
                if let Err(e) = res {
-                    tracing::error!("failed to upload to remote storage: {e:#}");
+                    tracing::error!("failed to upload to S3: {e:#}");
                }
            }
        };
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -580,11 +580,9 @@ async fn import_file(
        import_slru(modification, slru, file_path, reader, len, ctx).await?;
        debug!("imported multixact members slru");
    } else if file_path.starts_with("pg_twophase") {
-        let bytes = read_all_bytes(reader).await?;
+        let xid = u32::from_str_radix(file_name.as_ref(), 16)?;

-        // In PostgreSQL v17, this is a 64-bit FullTransactionid. In previous versions,
-        // it's a 32-bit TransactionId, which fits in u64 anyway.
-        let xid = u64::from_str_radix(file_name.as_ref(), 16)?;
+        let bytes = read_all_bytes(reader).await?;
        modification
            .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx)
            .await?;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, VariantNames};
-use strum_macros::{IntoStaticStr, VariantNames};
+use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;

@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];

 // Metrics collected on operations on the storage repository.
-#[derive(Debug, VariantNames, IntoStaticStr)]
+#[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
@@ -1185,6 +1185,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
    ctx: &'c RequestContext,
    start: std::time::Instant,
    op: SmgrQueryType,
+    count: usize,
 }

 impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
@@ -1212,9 +1213,11 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> {
                elapsed
            }
        };
-        self.global_metric.observe(ex_throttled.as_secs_f64());
-        if let Some(timeline_metric) = self.timeline_metric {
-            timeline_metric.observe(ex_throttled.as_secs_f64());
+        for _ in 0..self.count {
+            self.global_metric.observe(ex_throttled.as_secs_f64());
+            if let Some(timeline_metric) = self.timeline_metric {
+                timeline_metric.observe(ex_throttled.as_secs_f64());
+            }
        }
    }
 }
@@ -1343,6 +1346,14 @@ impl SmgrQueryTimePerTimeline {
        &'a self,
        op: SmgrQueryType,
        ctx: &'c RequestContext,
+    ) -> Option<impl Drop + '_> {
+        self.start_timer_many(op, 1, ctx)
+    }
+    pub(crate) fn start_timer_many<'c: 'a, 'a>(
+        &'a self,
+        op: SmgrQueryType,
+        count: usize,
+        ctx: &'c RequestContext,
    ) -> Option<impl Drop + '_> {
        let global_metric = &self.global_metrics[op as usize];
        let start = Instant::now();
@@ -1376,6 +1387,7 @@ impl SmgrQueryTimePerTimeline {
            ctx,
            start,
            op,
+            count,
        })
    }
 }
@@ -1777,7 +1789,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
    .expect("failed to define a metric"),
    upload_heatmap_duration: register_histogram!(
        "pageserver_secondary_upload_heatmap_duration",
-        "Time to build and upload a heatmap, including any waiting inside the remote storage client"
+        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
    download_heatmap: register_int_counter!(
@@ -3170,6 +3182,16 @@ static TOKIO_EXECUTOR_THREAD_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) static CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM: Lazy<Histogram> =
+    Lazy::new(|| {
+        register_histogram!(
+            "pageserver_consecutive_nonblocking_getpage_requests",
+            "Number of consecutive nonblocking getpage requests",
+            (0..=256).map(|x| x as f64).collect::<Vec<f64>>(),
+        )
+        .unwrap()
+    });
+
 pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
    static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(());
    let _guard = SERIALIZE.lock().unwrap();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -5,14 +5,14 @@ use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
-use once_cell::sync::OnceCell;
-use pageserver_api::models::TenantState;
+use once_cell::sync::{Lazy, OnceCell};
+use pageserver_api::models::{self, TenantState};
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
-    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
-    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
-    PagestreamNblocksResponse, PagestreamProtocolVersion,
+    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
+    PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
+    PagestreamProtocolVersion,
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
@@ -43,7 +43,7 @@ use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics;
+use crate::metrics::{self, CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM};
 use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -58,7 +58,7 @@ use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use pageserver_api::key::rel_block_to_key;
-use pageserver_api::reltag::SlruKind;
+use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -577,124 +577,317 @@ impl PageServerHandler {
            }
        }

-        loop {
-            // read request bytes (it's exactly 1 PagestreamFeMessage per CopyData)
-            let msg = tokio::select! {
-                biased;
-                _ = self.cancel.cancelled() => {
-                    return Err(QueryError::Shutdown)
-                }
-                msg = pgb.read_message() => { msg }
-            };
-            let copy_data_bytes = match msg? {
-                Some(FeMessage::CopyData(bytes)) => bytes,
-                Some(FeMessage::Terminate) => break,
-                Some(m) => {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "unexpected message: {m:?} during COPY"
-                    )));
-                }
-                None => break, // client disconnected
-            };
+        let mut batched = None;
+        'outer: loop {
+            enum DebouncedFeMessage {
+                Exists(models::PagestreamExistsRequest),
+                Nblocks(models::PagestreamNblocksRequest),
+                GetPage {
+                    span: Span,
+                    shard: timeline::handle::Handle<TenantManagerTypes>,
+                    effective_request_lsn: Lsn,
+                    pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+                },
+                DbSize(models::PagestreamDbSizeRequest),
+                GetSlruSegment(models::PagestreamGetSlruSegmentRequest),
+                RespondError(Span, PageStreamError),
+            }
+            let mut debounce: Option<std::time::Instant> = None;
+            // return or `?` on protocol error
+            // `break EXPR` to stop batching. The EXPR will be the first message in the next batch.
+            let next_batched: Option<DebouncedFeMessage> = loop {
+                static BOUNCE_TIMEOUT: Lazy<Duration> = Lazy::new(|| {
+                    utils::env::var::<humantime::Duration, _>("NEON_PAGESERVER_DEBOUNCE")
+                        .unwrap()
+                        .into()
+                });
+                let sleep_fut = if let Some(started_at) = debounce {
+                    futures::future::Either::Left(tokio::time::sleep_until(
+                        (started_at + *BOUNCE_TIMEOUT).into(),
+                    ))
+                } else {
+                    futures::future::Either::Right(futures::future::pending())
+                };
+                let msg = tokio::select! {
+                    biased;
+                    _ = self.cancel.cancelled() => {
+                        return Err(QueryError::Shutdown)
+                    }
+                    msg = pgb.read_message() => {
+                        msg
+                    }
+                    _ = sleep_fut => {
+                        assert!(batched.is_some());
+                        break None;
+                    }
+                };
+                let copy_data_bytes = match msg? {
+                    Some(FeMessage::CopyData(bytes)) => bytes,
+                    Some(FeMessage::Terminate) => break 'outer,
+                    Some(m) => {
+                        return Err(QueryError::Other(anyhow::anyhow!(
+                            "unexpected message: {m:?} during COPY"
+                        )));
+                    }
+                    None => break 'outer, // client disconnected
+                };
+                trace!("query: {copy_data_bytes:?}");
+                fail::fail_point!("ps::handle-pagerequest-message");

-            trace!("query: {copy_data_bytes:?}");
-            fail::fail_point!("ps::handle-pagerequest-message");
+                // parse request
+                let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

-            // parse request
-            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+                let this_msg = match neon_fe_msg {
+                    PagestreamFeMessage::Exists(msg) => DebouncedFeMessage::Exists(msg),
+                    PagestreamFeMessage::Nblocks(msg) => DebouncedFeMessage::Nblocks(msg),
+                    PagestreamFeMessage::DbSize(msg) => DebouncedFeMessage::DbSize(msg),
+                    PagestreamFeMessage::GetSlruSegment(msg) => {
+                        DebouncedFeMessage::GetSlruSegment(msg)
+                    }
+                    PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                        request_lsn,
+                        not_modified_since,
+                        rel,
+                        blkno,
+                    }) => {
+                        let span = tracing::info_span!("handle_get_page_at_lsn_request_batched", %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, batch_size = tracing::field::Empty);
+                        let key = rel_block_to_key(rel, blkno);
+                        let shard = match self
+                            .timeline_handles
+                            .get(tenant_id, timeline_id, ShardSelector::Page(key))
+                            .instrument(span.clone())
+                            .await
+                        {
+                            Ok(tl) => tl,
+                            Err(GetActiveTimelineError::Tenant(
+                                GetActiveTenantError::NotFound(_),
+                            )) => {
+                                // We already know this tenant exists in general, because we resolved it at
+                                // start of connection.  Getting a NotFound here indicates that the shard containing
+                                // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                                // mapping is out of date.
+                                //
+                                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                                // and talk to a different pageserver.
+                                break Some(DebouncedFeMessage::RespondError(
+                                    span,
+                                    PageStreamError::Reconnect(
+                                        "getpage@lsn request routed to wrong shard".into(),
+                                    ),
+                                ));
+                            }
+                            Err(e) => break Some(DebouncedFeMessage::RespondError(span, e.into())),
+                        };
+                        let effective_request_lsn = match Self::wait_or_get_last_lsn(
+                            &shard,
+                            request_lsn,
+                            not_modified_since,
+                            &shard.get_latest_gc_cutoff_lsn(),
+                            &ctx,
+                        )
+                        // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
+                        .await
+                        {
+                            Ok(lsn) => lsn,
+                            Err(e) => {
+                                break Some(DebouncedFeMessage::RespondError(span, e));
+                            }
+                        };
+                        DebouncedFeMessage::GetPage {
+                            span,
+                            shard,
+                            effective_request_lsn,
+                            pages: smallvec::smallvec![(rel, blkno)],
+                        }
+                    }
+                };
+
+                // check if we can debounce
+                match (&mut batched, this_msg) {
+                    (None, this_msg) => {
+                        batched = Some(this_msg);
+                    }
+                    (
+                        Some(DebouncedFeMessage::GetPage {
+                            span: _,
+                            shard: accum_shard,
+                            pages: accum_pages,
+                            effective_request_lsn: accum_lsn,
+                        }),
+                        DebouncedFeMessage::GetPage {
+                            span: _,
+                            shard: this_shard,
+                            pages: this_pages,
+                            effective_request_lsn: this_lsn,
+                        },
+                    ) if async {
+                        assert_eq!(this_pages.len(), 1);
+                        if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
+                            assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
+                            return false;
+                        }
+                        if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
+                            != (this_shard.tenant_shard_id, this_shard.timeline_id)
+                        {
+                            // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                            // But the current logig for keeping responses in order does not support that.
+                            return false;
+                        }
+                        // the vectored get currently only supports a single LSN, so, bounce as soon
+                        // as the effective request_lsn changes
+                        return *accum_lsn == this_lsn;
+                    }
+                    .await =>
+                    {
+                        // ok to batch
+                        accum_pages.extend(this_pages);
+                    }
+                    (Some(_), this_msg) => {
+                        // by default, don't continue batching
+                        break Some(this_msg);
+                    }
+                }
+
+                // debounce impl piece
+                let started_at = debounce.get_or_insert_with(Instant::now);
+                if started_at.elapsed() > *BOUNCE_TIMEOUT {
+                    break None;
+                }
+            };

            // invoke handler function
-            let (handler_result, span) = match neon_fe_msg {
-                PagestreamFeMessage::Exists(req) => {
+            let (handler_results, span): (
+                smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]>,
+                _,
+            ) = match batched.take().expect("loop above ensures this") {
+                DebouncedFeMessage::Exists(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
-                            .instrument(span.clone())
-                            .await,
+                        smallvec::smallvec![
+                            self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
+                                .instrument(span.clone())
+                                .await
+                        ],
                        span,
                    )
                }
-                PagestreamFeMessage::Nblocks(req) => {
+                DebouncedFeMessage::Nblocks(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
-                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
-                            .instrument(span.clone())
-                            .await,
+                        smallvec::smallvec![
+                            self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
+                                .instrument(span.clone())
+                                .await,
+                        ],
                        span,
                    )
                }
-                PagestreamFeMessage::GetPage(req) => {
+                DebouncedFeMessage::GetPage {
+                    span,
+                    shard,
+                    effective_request_lsn,
+                    pages,
+                } => {
+                    CONSECUTIVE_NONBLOCKING_GETPAGE_REQUESTS_HISTOGRAM.observe(pages.len() as f64);
+                    span.record("batch_size", pages.len() as u64);
                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
-                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
-                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
-                            .instrument(span.clone())
-                            .await,
+                        {
+                            let npages = pages.len();
+                            let res = self
+                                .handle_get_page_at_lsn_request_batched(
+                                    &shard,
+                                    effective_request_lsn,
+                                    pages,
+                                    &ctx,
+                                )
+                                .instrument(span.clone())
+                                .await;
+                            assert_eq!(res.len(), npages);
+                            res
+                        },
                        span,
                    )
                }
-                PagestreamFeMessage::DbSize(req) => {
+                DebouncedFeMessage::DbSize(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
-                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
-                            .instrument(span.clone())
-                            .await,
+                        smallvec::smallvec![
+                            self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
+                                .instrument(span.clone())
+                                .await
+                        ],
                        span,
                    )
                }
-                PagestreamFeMessage::GetSlruSegment(req) => {
+                DebouncedFeMessage::GetSlruSegment(req) => {
                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
-                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
+                        smallvec::smallvec![
+                            self.handle_get_slru_segment_request(
+                                tenant_id,
+                                timeline_id,
+                                &req,
+                                &ctx
+                            )
                            .instrument(span.clone())
-                            .await,
+                            .await
+                        ],
                        span,
                    )
                }
+                DebouncedFeMessage::RespondError(span, e) => {
+                    // We've already decided to respond with an error, so we don't need to
+                    // call the handler.
+                    (smallvec::smallvec![Err(e)], span)
+                }
            };

            // Map handler result to protocol behavior.
            // Some handler errors cause exit from pagestream protocol.
            // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-            let response_msg = match handler_result {
-                Err(e) => match &e {
-                    PageStreamError::Shutdown => {
-                        // If we fail to fulfil a request during shutdown, which may be _because_ of
-                        // shutdown, then do not send the error to the client.  Instead just drop the
-                        // connection.
-                        span.in_scope(|| info!("dropping connection due to shutdown"));
-                        return Err(QueryError::Shutdown);
-                    }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                        return Err(QueryError::Reconnect);
-                    }
-                    PageStreamError::Read(_)
-                    | PageStreamError::LsnTimeout(_)
-                    | PageStreamError::NotFound(_)
-                    | PageStreamError::BadRequest(_) => {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        let full = utils::error::report_compact_sources(&e);
-                        span.in_scope(|| {
-                            error!("error reading relation or page version: {full:#}")
-                        });
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
-                    }
-                },
-                Ok(response_msg) => response_msg,
-            };
+            for handler_result in handler_results {
+                let response_msg = match handler_result {
+                    Err(e) => match &e {
+                        PageStreamError::Shutdown => {
+                            // If we fail to fulfil a request during shutdown, which may be _because_ of
+                            // shutdown, then do not send the error to the client.  Instead just drop the
+                            // connection.
+                            span.in_scope(|| info!("dropping connection due to shutdown"));
+                            return Err(QueryError::Shutdown);
+                        }
+                        PageStreamError::Reconnect(reason) => {
+                            span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                            return Err(QueryError::Reconnect);
+                        }
+                        PageStreamError::Read(_)
+                        | PageStreamError::LsnTimeout(_)
+                        | PageStreamError::NotFound(_)
+                        | PageStreamError::BadRequest(_) => {
+                            // print the all details to the log with {:#}, but for the client the
+                            // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                            // here includes cancellation which is not an error.
+                            let full = utils::error::report_compact_sources(&e);
+                            span.in_scope(|| {
+                                error!("error reading relation or page version: {full:#}")
+                            });
+                            PagestreamBeMessage::Error(PagestreamErrorResponse {
+                                message: e.to_string(),
+                            })
+                        }
+                    },
+                    Ok(response_msg) => response_msg,
+                };

-            // marshal & transmit response message
-            pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+                // marshal & transmit response message
+                pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+            }
            tokio::select! {
                biased;
                _ = self.cancel.cancelled() => {
@@ -706,6 +899,9 @@ impl PageServerHandler {
                    res?;
                }
            }
+
+            assert!(batched.is_none(), "we take() earlier");
+            batched = next_batched;
        }
        Ok(())
    }
@@ -949,60 +1145,30 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip_all, fields(shard_id))]
-    async fn handle_get_page_at_lsn_request(
+    #[instrument(skip_all)]
+    async fn handle_get_page_at_lsn_request_batched(
        &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        req: &PagestreamGetPageRequest,
+        timeline: &Timeline,
+        effective_lsn: Lsn,
+        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = match self
-            .timeline_handles
-            .get(
-                tenant_id,
-                timeline_id,
-                ShardSelector::Page(rel_block_to_key(req.rel, req.blkno)),
-            )
-            .await
-        {
-            Ok(tl) => tl,
-            Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                // We already know this tenant exists in general, because we resolved it at
-                // start of connection.  Getting a NotFound here indicates that the shard containing
-                // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                // mapping is out of date.
-                //
-                // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                // and talk to a different pageserver.
-                return Err(PageStreamError::Reconnect(
-                    "getpage@lsn request routed to wrong shard".into(),
-                ));
-            }
-            Err(e) => return Err(e.into()),
-        };
-
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);
-
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            &timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
+    ) -> smallvec::SmallVec<[Result<PagestreamBeMessage, PageStreamError>; 1]> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let _timer = timeline.query_metrics.start_timer_many(
+            metrics::SmgrQueryType::GetPageAtLsn,
+            pages.len(),
            ctx,
-        )
-        .await?;
+        );

-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
-            .await?;
+        let pages = timeline
+            .get_rel_page_at_lsn_batched(pages, Version::Lsn(effective_lsn), ctx)
+            .await;

-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
+        smallvec::SmallVec::from_iter(pages.into_iter().map(|page| {
+            page.map(|page| {
+                PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
+            })
+            .map_err(PageStreamError::Read)
        }))
    }

@@ -1199,6 +1365,7 @@ impl PageServerHandler {
    }
 }

+#[async_trait::async_trait]
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -1498,3 +1665,10 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
    );
    debug_assert_current_span_has_tenant_and_timeline_id();
 }
+
+struct WaitedForLsn(Lsn);
+impl From<WaitedForLsn> for Lsn {
+    fn from(WaitedForLsn(lsn): WaitedForLsn) -> Self {
+        lsn
+    }
+}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,12 +9,17 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
+use crate::span::{
+    debug_assert_current_span_has_tenant_and_timeline_id,
+    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
+};
+use crate::tenant::timeline::GetVectoredError;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -28,7 +33,7 @@ use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::collections::{hash_map, HashMap, HashSet};
+use std::collections::{hash_map, BTreeMap, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
@@ -191,26 +196,184 @@ impl Timeline {
        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
-        if tag.relnode == 0 {
-            return Err(PageReconstructError::Other(
-                RelationError::InvalidRelnode.into(),
-            ));
-        }
+        let pages = smallvec::smallvec![(tag, blknum)];
+        let res = self.get_rel_page_at_lsn_batched(pages, version, ctx).await;
+        assert_eq!(res.len(), 1);
+        res.into_iter().next().unwrap()
+    }

-        let nblocks = self.get_rel_size(tag, version, ctx).await?;
-        if blknum >= nblocks {
-            debug!(
-                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag,
-                blknum,
-                version.get_lsn(),
-                nblocks
-            );
-            return Ok(ZERO_PAGE.clone());
+    /// Like [`get_rel_page_at_lsn`], but returns a batch of pages.
+    pub(crate) async fn get_rel_page_at_lsn_batched(
+        &self,
+        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        version: Version<'_>,
+        ctx: &RequestContext,
+    ) -> smallvec::SmallVec<[Result<Bytes, PageReconstructError>; 1]> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let request_lsn = match version {
+            Version::Lsn(lsn) => lsn,
+            Version::Modified(_) => panic!("unsupported"),
+        };
+        enum KeyState {
+            NeedsVectoredGet,
+            Done(Result<Bytes, PageReconstructError>),
        }
+        let mut key_states = BTreeMap::new();
+        let mut vectored_gets: smallvec::SmallVec<[_; 1]> =
+            smallvec::SmallVec::with_capacity(pages.len());
+        for (response_order, (tag, blknum)) in pages.into_iter().enumerate() {
+            let key = rel_block_to_key(tag, blknum);
+            use std::collections::btree_map::Entry;
+            let key_state_slot = match key_states.entry((key, response_order)) {
+                Entry::Occupied(_entry) => unreachable!(
+                    "enumerate makes keys unique, even if batch contains same key twice"
+                ),
+                Entry::Vacant(entry) => entry,
+            };

-        let key = rel_block_to_key(tag, blknum);
-        version.get(self, key, ctx).await
+            if tag.relnode == 0 {
+                key_state_slot.insert(KeyState::Done(Err(PageReconstructError::Other(
+                    RelationError::InvalidRelnode.into(),
+                ))));
+                continue;
+            }
+
+            let nblocks = match self.get_rel_size(tag, version, ctx).await {
+                Ok(nblocks) => nblocks,
+                Err(err) => {
+                    key_state_slot.insert(KeyState::Done(Err(err)));
+                    continue;
+                }
+            };
+            if blknum >= nblocks {
+                debug!(
+                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
+                    tag,
+                    blknum,
+                    version.get_lsn(),
+                    nblocks
+                );
+                key_state_slot.insert(KeyState::Done(Ok(ZERO_PAGE.clone())));
+                continue;
+            }
+
+            vectored_gets.push(key);
+            key_state_slot.insert(KeyState::NeedsVectoredGet);
+        }
+        // turn vectored_gets into a keyspace
+        let keyspace = {
+            // add_key reuqires monotonicity
+            vectored_gets.sort_unstable();
+            let mut acc = KeySpaceAccum::new();
+            for key in vectored_gets
+                .into_iter()
+                // in fact it requires strong monotonicity
+                .dedup()
+            {
+                acc.add_key(key);
+            }
+            acc.to_keyspace()
+        };
+
+        match self.get_vectored(keyspace, request_lsn, ctx).await {
+            Ok(results) => {
+                for (key, res) in results {
+                    if let Err(err) = &res {
+                        warn!(%key, ?err, "a key inside get_vectored failed with a per-key error");
+                    }
+                    let mut interests = key_states.range_mut((key, 0)..(key.next(), 0)).peekable();
+                    let first_interest = interests.next().unwrap();
+                    let next_interest = interests.peek().is_some();
+                    if !next_interest {
+                        match first_interest.1 {
+                            KeyState::NeedsVectoredGet => {
+                                *first_interest.1 = KeyState::Done(res);
+                            }
+                            KeyState::Done(_) => unreachable!(),
+                        }
+                        continue;
+                    } else {
+                        for ((_, _), state) in [first_interest].into_iter().chain(interests) {
+                            match state {
+                                KeyState::NeedsVectoredGet => {
+                                    *state = KeyState::Done(match &res {
+                                        Ok(buf) => Ok(buf.clone()),
+                                        // this `match` is working around the fact that we cannot Clone the PageReconstructError
+                                        Err(err) => Err(match err {
+                                            PageReconstructError::Cancelled => {
+                                                PageReconstructError::Cancelled
+                                            }
+
+                                            x @ PageReconstructError::Other(_) |
+                                            x @ PageReconstructError::AncestorLsnTimeout(_) |
+                                            x @ PageReconstructError::WalRedo(_) |
+                                            x @ PageReconstructError::MissingKey(_) => {
+                                                PageReconstructError::Other(anyhow::anyhow!("there was more than one request for this key in the batch, error logged once: {x:?}"))
+                                            },
+                                        }),
+                                    });
+                                }
+                                KeyState::Done(_) => unreachable!(),
+                            }
+                        }
+                    }
+                }
+            }
+            Err(err) => {
+                warn!(?err, "get_vectored failed with a global error, mapping that error to per-key failure");
+                // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
+                for ((_, _), state) in key_states.iter_mut() {
+                    // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
+                    // but without taking ownership of the GetVectoredError
+                    match &err {
+                        GetVectoredError::Cancelled => {
+                            *state = KeyState::Done(Err(PageReconstructError::Cancelled));
+                        }
+                        // TODO: restructure get_vectored API to make this error per-key
+                        GetVectoredError::MissingKey(err) => {
+                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more of the requested keys were missing: {err:?}"))));
+                        }
+                        // TODO: restructure get_vectored API to make this error per-key
+                        GetVectoredError::GetReadyAncestorError(err) => {
+                            *state = KeyState::Done(Err(PageReconstructError::Other(anyhow::anyhow!("whole vectored get request failed because one or more key required ancestor that wasn't ready: {err:?}"))));
+                        }
+                        // TODO: restructure get_vectored API to make this error per-key
+                        GetVectoredError::Other(err) => {
+                            *state = KeyState::Done(Err(PageReconstructError::Other(
+                                anyhow::anyhow!("whole vectored get request failed: {err:?}"),
+                            )));
+                        }
+                        // TODO: we can prevent this error class by moving this check into the type system
+                        GetVectoredError::InvalidLsn(e) => {
+                            *state =
+                                KeyState::Done(Err(anyhow::anyhow!("invalid LSN: {e:?}").into()));
+                        }
+                        // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
+                        // TODO: we can prevent this error class by moving this check into the type system
+                        GetVectoredError::Oversized(err) => {
+                            *state = KeyState::Done(Err(anyhow::anyhow!(
+                                "batching oversized: {err:?}"
+                            )
+                            .into()));
+                        }
+                    }
+                }
+            }
+        };
+
+        // get the results into the order in which they were requested
+        let mut return_order: smallvec::SmallVec<[_; Timeline::MAX_GET_VECTORED_KEYS as usize]> =
+            smallvec::SmallVec::with_capacity(key_states.len());
+        return_order.extend(key_states.keys().map(|(key, idx)| (*key, *idx)));
+        return_order.sort_unstable_by_key(|(_, idx)| *idx);
+        let mut res = smallvec::SmallVec::with_capacity(key_states.len());
+        res.extend(return_order.into_iter().map(|key_states_key| {
+            match key_states.remove(&key_states_key).unwrap() {
+                KeyState::Done(res) => res,
+                KeyState::NeedsVectoredGet => unreachable!(),
+            }
+        }));
+        res
    }

    // Get size of a database in blocks
@@ -633,7 +796,7 @@ impl Timeline {

    pub(crate) async fn get_twophase_file(
        &self,
-        xid: u64,
+        xid: TransactionId,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -646,19 +809,11 @@ impl Timeline {
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<HashSet<u64>, PageReconstructError> {
+    ) -> Result<HashSet<TransactionId>, PageReconstructError> {
        // fetch directory entry
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;

-        if self.pg_version >= 17 {
-            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
-        } else {
-            Ok(TwoPhaseDirectory::des(&buf)?
-                .xids
-                .iter()
-                .map(|x| u64::from(*x))
-                .collect())
-        }
+        Ok(TwoPhaseDirectory::des(&buf)?.xids)
    }

    pub(crate) async fn get_control_file(
@@ -910,13 +1065,9 @@ impl Timeline {

        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
-
-        let mut xids: Vec<u64> = self
-            .list_twophase_files(lsn, ctx)
-            .await?
-            .iter()
-            .cloned()
-            .collect();
+        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
+        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
            result.add_key(twophase_file_key(xid));
@@ -1139,15 +1290,9 @@ impl<'a> DatadirModification<'a> {
        // Create AuxFilesDirectory
        self.init_aux_dir()?;

-        let buf = if self.tline.pg_version >= 17 {
-            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
-                xids: HashSet::new(),
-            })
-        } else {
-            TwoPhaseDirectory::ser(&TwoPhaseDirectory {
-                xids: HashSet::new(),
-            })
-        }?;
+        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
+            xids: HashSet::new(),
+        })?;
        self.pending_directory_entries
            .push((DirectoryKind::TwoPhase, 0));
        self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
@@ -1223,13 +1368,6 @@ impl<'a> DatadirModification<'a> {
        img: Bytes,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let key = rel_block_to_key(rel, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
-        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
    }
@@ -1241,34 +1379,14 @@ impl<'a> DatadirModification<'a> {
        blknum: BlockNumber,
        img: Bytes,
    ) -> anyhow::Result<()> {
-        let key = slru_block_to_key(kind, segno, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
-        }
-        self.put(key, Value::Image(img));
+        self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
        Ok(())
    }

-    pub(crate) fn put_rel_page_image_zero(
-        &mut self,
-        rel: RelTag,
-        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let key = rel_block_to_key(rel, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
-        }
-        self.pending_zero_data_pages.insert(key.to_compact());
+    pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) {
+        self.pending_zero_data_pages
+            .insert(rel_block_to_key(rel, blknum).to_compact());
        self.pending_bytes += ZERO_PAGE.len();
-        Ok(())
    }

    pub(crate) fn put_slru_page_image_zero(
@@ -1276,18 +1394,10 @@ impl<'a> DatadirModification<'a> {
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        let key = slru_block_to_key(kind, segno, blknum);
-        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
-        }
-        self.pending_zero_data_pages.insert(key.to_compact());
+    ) {
+        self.pending_zero_data_pages
+            .insert(slru_block_to_key(kind, segno, blknum).to_compact());
        self.pending_bytes += ZERO_PAGE.len();
-        Ok(())
    }

    /// Call this at the end of each WAL record.
@@ -1339,31 +1449,22 @@ impl<'a> DatadirModification<'a> {

    pub async fn put_twophase_file(
        &mut self,
-        xid: u64,
+        xid: TransactionId,
        img: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Add it to the directory entry
-        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= 17 {
-            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
-            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
-            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
-            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
-        } else {
-            let xid = xid as u32;
-            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
-            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
-            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
-            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
-        };
-        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
+        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
+        let mut dir = TwoPhaseDirectory::des(&buf)?;
+        if !dir.xids.insert(xid) {
+            anyhow::bail!("twophase file for xid {} already exists", xid);
+        }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
+        self.put(
+            TWOPHASEDIR_KEY,
+            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
+        );

        self.put(twophase_file_key(xid), Value::Image(img));
        Ok(())
@@ -1666,32 +1767,22 @@ impl<'a> DatadirModification<'a> {
    /// This method is used for marking truncated SLRU files
    pub async fn drop_twophase_file(
        &mut self,
-        xid: u64,
+        xid: TransactionId,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= 17 {
-            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;
+        let mut dir = TwoPhaseDirectory::des(&buf)?;

-            if !dir.xids.remove(&xid) {
-                warn!("twophase file for xid {} does not exist", xid);
-            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
-            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
-        } else {
-            let xid: u32 = u32::try_from(xid)?;
-            let mut dir = TwoPhaseDirectory::des(&buf)?;
-
-            if !dir.xids.remove(&xid) {
-                warn!("twophase file for xid {} does not exist", xid);
-            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
-            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
-        };
-        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
+        if !dir.xids.remove(&xid) {
+            warn!("twophase file for xid {} does not exist", xid);
+        }
+        self.pending_directory_entries
+            .push((DirectoryKind::TwoPhase, dir.xids.len()));
+        self.put(
+            TWOPHASEDIR_KEY,
+            Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
+        );

        // Delete it
        self.delete(twophase_key_range(xid));
@@ -2161,21 +2252,11 @@ struct DbDirectory {
    dbdirs: HashMap<(Oid, Oid), bool>,
 }

-// The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of
-// pg_twophase files was expanded from 32-bit XIDs to 64-bit XIDs.  Previously, the files
-// were named like "pg_twophase/000002E5", now they're like
-// "pg_twophsae/0000000A000002E4".
-
 #[derive(Debug, Serialize, Deserialize)]
 struct TwoPhaseDirectory {
    xids: HashSet<TransactionId>,
 }

-#[derive(Debug, Serialize, Deserialize)]
-struct TwoPhaseDirectoryV17 {
-    xids: HashSet<u64>,
-}
-
 #[derive(Debug, Serialize, Deserialize, Default)]
 struct RelDirectory {
    // Set of relations that exist. (relfilenode, forknum)
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -73,6 +73,21 @@ impl ValueBytes {

        Ok(raw[8] == 1)
    }
+
+    pub(crate) fn is_image(raw: &[u8]) -> Result<bool, InvalidInput> {
+        if raw.len() < 12 {
+            return Err(InvalidInput::TooShortValue);
+        }
+
+        let value_discriminator = &raw[0..4];
+
+        if value_discriminator == [0, 0, 0, 0] {
+            // Value::Image always initializes
+            return Ok(true);
+        }
+
+        Ok(false)
+    }
 }

 #[cfg(test)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -7091,13 +7091,13 @@ mod tests {
            vec![
                // Image layer at GC horizon
                PersistentLayerKey {
-                    key_range: Key::MIN..Key::MAX,
+                    key_range: Key::MIN..Key::NON_L0_MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x31),
                    is_delta: false
                },
-                // The delta layer below the horizon
+                // The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
                PersistentLayerKey {
-                    key_range: get_key(3)..get_key(4),
+                    key_range: Key::MIN..Key::NON_L0_MAX,
                    lsn_range: Lsn(0x30)..Lsn(0x48),
                    is_delta: true
                },
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -452,8 +452,7 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
            toml_edit::Item::Table(table) => {
-                let deserializer =
-                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
+                let deserializer = toml_edit::de::Deserializer::new(table.into());
                return serde_path_to_error::deserialize(deserializer)
                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
            }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,15 +8,17 @@ mod layer_desc;
 mod layer_name;
 pub mod merge_iterator;

+use tokio::sync::{self};
+use utils::bin_ser::BeSer;
 pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Value;
+use crate::repository::{Value, ValueBytes};
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
-use pageserver_api::key::Key;
+use pageserver_api::key::{Key, DBDIR_KEY};
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
-use std::cmp::{Ordering, Reverse};
+use std::cmp::Ordering;
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
@@ -79,12 +81,18 @@ pub(crate) enum ValueReconstructSituation {
 }

 /// Reconstruct data accumulated for a single key during a vectored get
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Default)]
 pub(crate) struct VectoredValueReconstructState {
-    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
-    pub(crate) img: Option<(Lsn, Bytes)>,
+    pub(crate) records: Vec<(
+        Lsn,
+        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
+    )>,
+    pub(crate) img: Option<(
+        Lsn,
+        tokio::sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
+    )>,

-    situation: ValueReconstructSituation,
+    pub(crate) situation: ValueReconstructSituation,
 }

 impl VectoredValueReconstructState {
@@ -93,16 +101,57 @@ impl VectoredValueReconstructState {
    }
 }

-impl From<VectoredValueReconstructState> for ValueReconstructState {
-    fn from(mut state: VectoredValueReconstructState) -> Self {
-        // walredo expects the records to be descending in terms of Lsn
-        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
+pub(crate) async fn convert(
+    _key: Key,
+    from: VectoredValueReconstructState,
+) -> Result<ValueReconstructState, PageReconstructError> {
+    let mut to = ValueReconstructState::default();

-        ValueReconstructState {
-            records: state.records,
-            img: state.img,
+    for (lsn, fut) in from.records {
+        match fut.await {
+            Ok(res) => match res {
+                Ok(bytes) => {
+                    let value = Value::des(&bytes)
+                        .map_err(|err| PageReconstructError::Other(err.into()))?;
+
+                    match value {
+                        Value::WalRecord(rec) => {
+                            to.records.push((lsn, rec));
+                        },
+                        Value::Image(img) => {
+                            assert!(to.img.is_none());
+                            to.img = Some((lsn, img));
+                        }
+                    }
+                }
+                Err(err) => {
+                    return Err(PageReconstructError::Other(err.into()));
+                }
+            },
+            Err(err) => {
+                return Err(PageReconstructError::Other(err.into()));
+            }
        }
    }
+
+    if to.img.is_none() && from.img.is_some() {
+        let (lsn, fut) = from.img.expect("Has an image");
+        match fut.await {
+            Ok(res) => match res {
+                Ok(bytes) => {
+                    to.img = Some((lsn, bytes));
+                }
+                Err(err) => {
+                    return Err(PageReconstructError::Other(err.into()));
+                }
+            },
+            Err(err) => {
+                return Err(PageReconstructError::Other(err.into()));
+            }
+        }
+    }
+
+    Ok(to)
 }

 /// Bag of data accumulated during a vectored get..
@@ -200,7 +249,8 @@ impl ValuesReconstructState {
        &mut self,
        key: &Key,
        lsn: Lsn,
-        value: Value,
+        completes: bool,
+        value: sync::oneshot::Receiver<Result<Bytes, std::io::Error>>,
    ) -> ValueReconstructSituation {
        let state = self
            .keys
@@ -208,31 +258,14 @@ impl ValuesReconstructState {
            .or_insert(Ok(VectoredValueReconstructState::default()));

        if let Ok(state) = state {
-            let key_done = match state.situation {
+            match state.situation {
                ValueReconstructSituation::Complete => unreachable!(),
-                ValueReconstructSituation::Continue => match value {
-                    Value::Image(img) => {
-                        state.img = Some((lsn, img));
-                        true
-                    }
-                    Value::WalRecord(rec) => {
-                        debug_assert!(
-                            Some(lsn) > state.get_cached_lsn(),
-                            "Attempt to collect a record below cached LSN for walredo: {} < {}",
-                            lsn,
-                            state
-                                .get_cached_lsn()
-                                .expect("Assertion can only fire if a cached lsn is present")
-                        );
+                ValueReconstructSituation::Continue => {
+                    state.records.push((lsn, value));
+                }
+            }

-                        let will_init = rec.will_init();
-                        state.records.push((lsn, rec));
-                        will_init
-                    }
-                },
-            };
-
-            if key_done && state.situation == ValueReconstructSituation::Continue {
+            if completes && state.situation == ValueReconstructSituation::Continue {
                state.situation = ValueReconstructSituation::Complete;
                self.keys_done.add_key(*key);
            }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -42,13 +42,12 @@ use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadCoalesceMode, VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::BytesMut;
+use anyhow::{bail, ensure, Context, Result};
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -58,14 +57,14 @@ use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
+use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
-use tokio::sync::OnceCell;
+use tokio::sync::{self, OnceCell};
 use tokio_epoll_uring::IoBuf;
 use tracing::*;

@@ -224,7 +223,7 @@ pub struct DeltaLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    file: VirtualFile,
+    file: Arc<VirtualFile>,
    file_id: FileId,

    layer_key_range: Range<Key>,
@@ -788,9 +787,11 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
+        let file = Arc::new(
+            VirtualFile::open(path, ctx)
+                .await
+                .context("open layer file")?,
+        );

        let file_id = page_cache::next_file_id();

@@ -980,77 +981,59 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) {
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let mut ignore_key_with_err = None;
-
        let max_vectored_read_bytes = self
            .max_vectored_read_bytes
            .expect("Layer is loaded with max vectored bytes config")
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
        // track when a key is done.
        for read in reads.into_iter().rev() {
-            let res = vectored_blob_reader
-                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
-                .await;
-
-            let blobs_buf = match res {
-                Ok(blobs_buf) => blobs_buf,
-                Err(err) => {
-                    let kind = err.kind();
-                    for (_, blob_meta) in read.blobs_at.as_slice() {
-                        reconstruct_state.on_key_error(
-                            blob_meta.key,
-                            PageReconstructError::Other(anyhow!(
-                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
-                                kind
-                            )),
-                        );
-                    }
-
-                    // We have "lost" the buffer since the lower level IO api
-                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(BytesMut::with_capacity(buf_size));
-
-                    continue;
-                }
-            };
-
-            for meta in blobs_buf.blobs.iter().rev() {
-                if Some(meta.meta.key) == ignore_key_with_err {
-                    continue;
-                }
-
-                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
-                let value = match value {
-                    Ok(v) => v,
-                    Err(e) => {
-                        reconstruct_state.on_key_error(
-                            meta.meta.key,
-                            PageReconstructError::Other(anyhow!(e).context(format!(
-                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path,
-                            ))),
-                        );
-
-                        ignore_key_with_err = Some(meta.meta.key);
-                        continue;
-                    }
-                };
-
-                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
-                // state, no further updates shall be made to it. The call below will
-                // panic if the invariant is violated.
-                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
+            let mut senders: HashMap<
+                (Key, Lsn),
+                sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
+            > = Default::default();
+            for (_, blob_meta) in read.blobs_at.as_slice().iter().rev() {
+                let (tx, rx) = sync::oneshot::channel();
+                senders.insert((blob_meta.key, blob_meta.lsn), tx);
+                reconstruct_state.update_key(
+                    &blob_meta.key,
+                    blob_meta.lsn,
+                    blob_meta.will_init,
+                    rx,
+                );
            }

-            buf = Some(blobs_buf.buf);
+            let read_from = self.file.clone();
+            let read_ctx = ctx.attached_child();
+            tokio::task::spawn(async move {
+                let vectored_blob_reader = VectoredBlobReader::new(&read_from);
+                let buf = BytesMut::with_capacity(buf_size);
+
+                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;
+                match res {
+                    Ok(blobs_buf) => {
+                        for meta in blobs_buf.blobs.iter().rev() {
+                            let buf = &blobs_buf.buf[meta.start..meta.end];
+                            let sender = senders
+                                .remove(&(meta.meta.key, meta.meta.lsn))
+                                .expect("sender must exist");
+                            let _ = sender.send(Ok(Bytes::copy_from_slice(buf)));
+                        }
+
+                        assert!(senders.is_empty());
+                    }
+                    Err(err) => {
+                        for (_, sender) in senders {
+                            let _ = sender
+                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
+                        }
+                    }
+                }
+            });
        }
    }

@@ -1190,7 +1173,14 @@ impl DeltaLayerInner {
            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
                let end_offset = offset;

-                Some((BlobMeta { key, lsn }, start_offset..end_offset))
+                Some((
+                    BlobMeta {
+                        key,
+                        lsn,
+                        will_init: false,
+                    },
+                    start_offset..end_offset,
+                ))
            } else {
                None
            };
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -21,7 +21,7 @@
 //!
 //! Every image layer file consists of three parts: "summary",
 //! "index", and "values".  The summary is a fixed size header at the
-//! beginning of the file, and it contains basic information about the
+//! beginningof the file, and it contains basic information about the
 //! layer, and offsets to the other parts. The "index" is a B-tree,
 //! mapping from Key to an offset in the "values" part.  The
 //! actual page images are stored in the "values" part.
@@ -38,11 +38,11 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::vectored_blob_io::{
    BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
 };
-use crate::tenant::PageReconstructError;
+use crate::tenant::Timeline;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
@@ -52,12 +52,14 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::VecDeque;
+use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
+use std::sync::Arc;
+use tokio::sync::oneshot;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -69,7 +71,9 @@ use utils::{
 };

 use super::layer_name::ImageLayerName;
-use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
+use super::{
+    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -160,7 +164,7 @@ pub struct ImageLayerInner {
    key_range: Range<Key>,
    lsn: Lsn,

-    file: VirtualFile,
+    file: Arc<VirtualFile>,
    file_id: FileId,

    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
@@ -387,9 +391,11 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
-            .await
-            .context("open layer file")?;
+        let file = Arc::new(
+            VirtualFile::open(path, ctx)
+                .await
+                .context("open layer file")?,
+        );
        let file_id = page_cache::next_file_id();
        let block_reader = FileBlockReader::new(&file, file_id);
        let summary_blk = block_reader
@@ -576,8 +582,16 @@ impl ImageLayerInner {
            .0
            .into();

-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
        for read in reads.into_iter() {
+            let mut senders: HashMap<(Key, Lsn), oneshot::Sender<Result<Bytes, std::io::Error>>> =
+                Default::default();
+            for (_, blob_meta) in read.blobs_at.as_slice() {
+                let (tx, rx) = oneshot::channel();
+                senders.insert((blob_meta.key, blob_meta.lsn), tx);
+
+                reconstruct_state.update_key(&blob_meta.key, blob_meta.lsn, true, rx);
+            }
+
            let buf_size = read.size();

            if buf_size > max_vectored_read_bytes {
@@ -596,36 +610,36 @@ impl ImageLayerInner {
                );
            }

-            let buf = BytesMut::with_capacity(buf_size);
-            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
+            let read_from = self.file.clone();
+            let read_ctx = ctx.attached_child();
+            tokio::task::spawn(async move {
+                let buf = BytesMut::with_capacity(buf_size);
+                let vectored_blob_reader = VectoredBlobReader::new(&read_from);
+                let res = vectored_blob_reader.read_blobs(&read, buf, &read_ctx).await;

-            match res {
-                Ok(blobs_buf) => {
-                    let frozen_buf = blobs_buf.buf.freeze();
+                match res {
+                    Ok(blobs_buf) => {
+                        for meta in blobs_buf.blobs.iter().rev() {
+                            let buf = &blobs_buf.buf[meta.start..meta.end];
+                            let sender = senders
+                                .remove(&(meta.meta.key, meta.meta.lsn))
+                                .expect("sender must exist");
+                            // TODO: this is silly - sort it out
+                            let bytes = Value::ser(&Value::Image(Bytes::copy_from_slice(buf)))
+                                .expect("stupid but correct");
+                            let _ = sender.send(Ok(bytes.into()));
+                        }

-                    for meta in blobs_buf.blobs.iter() {
-                        let img_buf = frozen_buf.slice(meta.start..meta.end);
-                        reconstruct_state.update_key(
-                            &meta.meta.key,
-                            self.lsn,
-                            Value::Image(img_buf),
-                        );
+                        assert!(senders.is_empty());
+                    }
+                    Err(err) => {
+                        for (_, sender) in senders {
+                            let _ = sender
+                                .send(Err(std::io::Error::new(err.kind(), "vec read failed")));
+                        }
                    }
                }
-                Err(err) => {
-                    let kind = err.kind();
-                    for (_, blob_meta) in read.blobs_at.as_slice() {
-                        reconstruct_state.on_key_error(
-                            blob_meta.key,
-                            PageReconstructError::from(anyhow!(
-                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
-                                kind
-                            )),
-                        );
-                    }
-                }
-            };
+            });
        }
    }

@@ -797,9 +811,10 @@ impl ImageLayerWriterInner {
    ///
    async fn finish(
        self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        end_key: Option<Key>,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
+    ) -> anyhow::Result<ResidentLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -875,9 +890,12 @@ impl ImageLayerWriterInner {
        // fsync the file
        file.sync_all().await?;

-        trace!("created image layer {}", self.path);
+        // FIXME: why not carry the virtualfile here, it supports renaming?
+        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        Ok((desc, self.path))
+        info!("created image layer {}", layer.local_path());
+
+        Ok(layer)
    }
 }

@@ -956,18 +974,24 @@ impl ImageLayerWriter {
    ///
    pub(crate) async fn finish(
        mut self,
+        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(ctx, None).await
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner.take().unwrap().finish(timeline, ctx, None).await
    }

    /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
    pub(super) async fn finish_with_end_key(
        mut self,
+        timeline: &Arc<Timeline>,
        end_key: Key,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        self.inner.take().unwrap().finish(ctx, Some(end_key)).await
+    ) -> anyhow::Result<super::ResidentLayer> {
+        self.inner
+            .take()
+            .unwrap()
+            .finish(timeline, ctx, Some(end_key))
+            .await
    }
 }

@@ -1071,7 +1095,7 @@ mod test {
        tenant::{
            config::TenantConf,
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::{Layer, ResidentLayer},
+            storage_layer::ResidentLayer,
            vectored_blob_io::StreamingVectoredReadPlanner,
            Tenant, Timeline,
        },
@@ -1142,8 +1166,7 @@ mod test {

                key = key.next();
            }
-            let (desc, path) = writer.finish(&ctx).await.unwrap();
-            Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap()
+            writer.finish(&timeline, &ctx).await.unwrap()
        };
        let original_size = resident.metadata().file_size;

@@ -1205,9 +1228,7 @@ mod test {
                .await
                .unwrap();
            let replacement = if wrote_keys > 0 {
-                let (desc, path) = filtered_writer.finish(&ctx).await.unwrap();
-                let resident = Layer::finish_creating(tenant.conf, &timeline, desc, &path).unwrap();
-                Some(resident)
+                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
            } else {
                None
            };
@@ -1280,8 +1301,7 @@ mod test {
        for (key, img) in images {
            writer.put_image(key, img, ctx).await?;
        }
-        let (desc, path) = writer.finish(ctx).await?;
-        let img_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
+        let img_layer = writer.finish(tline, ctx).await?;

        Ok::<_, anyhow::Error>(img_layer)
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,10 +10,9 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::{l0_flush, page_cache};
-use anyhow::{anyhow, Context, Result};
+use anyhow::{Context, Result};
 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
@@ -35,9 +34,7 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
 use tokio::sync::RwLock;

-use super::{
-    DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
-};
+use super::{DeltaLayerWriter, PersistentLayerDesc, ValuesReconstructState};

 pub(crate) mod vectored_dio_read;

@@ -87,7 +84,7 @@ pub struct InMemoryLayerInner {
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
-    file: EphemeralFile,
+    file: Arc<tokio::sync::RwLock<EphemeralFile>>,

    resource_units: GlobalResourceUnits,
 }
@@ -381,7 +378,11 @@ impl InMemoryLayer {
    }

    pub(crate) fn try_len(&self) -> Option<u64> {
-        self.inner.try_read().map(|i| i.file.len()).ok()
+        self.inner
+            .try_read()
+            .map(|i| i.file.try_read().map(|i| i.len()).ok())
+            .ok()
+            .flatten()
    }

    pub(crate) fn assert_writable(&self) {
@@ -432,6 +433,10 @@ impl InMemoryLayer {
            read: vectored_dio_read::LogicalRead<Vec<u8>>,
        }
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
+        let mut senders: HashMap<
+            (Key, Lsn),
+            tokio::sync::oneshot::Sender<Result<Bytes, std::io::Error>>,
+        > = Default::default();

        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
@@ -459,6 +464,11 @@ impl InMemoryLayer {
                            Vec::with_capacity(len as usize),
                        ),
                    });
+
+                    let (tx, rx) = tokio::sync::oneshot::channel();
+                    senders.insert((key, *entry_lsn), tx);
+                    reconstruct_state.update_key(&key, *entry_lsn, will_init, rx);
+
                    if will_init {
                        break;
                    }
@@ -466,46 +476,42 @@ impl InMemoryLayer {
            }
        }

-        // Execute the reads.
+        let read_from = inner.file.clone();
+        let read_ctx = ctx.attached_child();
+        tokio::task::spawn(async move {
+            let locked = read_from.read().await;
+            let f = vectored_dio_read::execute(
+                &*locked,
+                reads
+                    .iter()
+                    .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+                &read_ctx,
+            );
+            send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
+                .await;

-        let f = vectored_dio_read::execute(
-            &inner.file,
-            reads
-                .iter()
-                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
-            &ctx,
-        );
-        send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
-            .await;
-
-        // Process results into the reconstruct state
-        'next_key: for (key, value_reads) in reads {
-            for ValueRead { entry_lsn, read } in value_reads {
-                match read.into_result().expect("we run execute() above") {
-                    Err(e) => {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                        continue 'next_key;
-                    }
-                    Ok(value_buf) => {
-                        let value = Value::des(&value_buf);
-                        if let Err(e) = value {
-                            reconstruct_state
-                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                            continue 'next_key;
+            for (key, value_reads) in reads {
+                for ValueRead { entry_lsn, read } in value_reads {
+                    let sender = senders
+                        .remove(&(key, entry_lsn))
+                        .expect("sender must exist");
+                    match read.into_result().expect("we run execute() above") {
+                        Err(e) => {
+                            let sender = senders
+                                .remove(&(key, entry_lsn))
+                                .expect("sender must exist");
+                            let _ = sender
+                                .send(Err(std::io::Error::new(e.kind(), "dio vec read failed")));
                        }
-
-                        let key_situation =
-                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
-                        if key_situation == ValueReconstructSituation::Complete {
-                            // TODO: metric to see if we fetched more values than necessary
-                            continue 'next_key;
+                        Ok(value_buf) => {
+                            let _ = sender.send(Ok(value_buf.into()));
                        }
-
-                        // process the next value in the next iteration of the loop
                    }
                }
            }
-        }
+
+            assert!(senders.is_empty());
+        });

        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);

@@ -600,7 +606,8 @@ impl InMemoryLayer {
    /// Get layer size.
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        Ok(inner.file.len())
+        let locked = inner.file.try_read().expect("no contention");
+        Ok(locked.len())
    }

    /// Create a new, empty, in-memory layer
@@ -614,9 +621,10 @@ impl InMemoryLayer {
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file =
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
-        let key = InMemoryLayerFileId(file.page_cache_file_id());
+        let file = Arc::new(tokio::sync::RwLock::new(
+            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?,
+        ));
+        let key = InMemoryLayerFileId(file.read().await.page_cache_file_id());

        Ok(InMemoryLayer {
            file_id: key,
@@ -648,7 +656,7 @@ impl InMemoryLayer {
        let mut inner = self.inner.write().await;
        self.assert_writable();

-        let base_offset = inner.file.len();
+        let base_offset = inner.file.read().await.len();

        let SerializedBatch {
            raw,
@@ -672,8 +680,13 @@ impl InMemoryLayer {
        }

        // Write the batch to the file
-        inner.file.write_raw(&raw, ctx).await?;
-        let new_size = inner.file.len();
+        // FIXME: can't borrow arc
+        let new_size = {
+            let mut locked = inner.file.write().await;
+            locked.write_raw(&raw, ctx).await?;
+            locked.len()
+        };
+
        let expected_new_len = base_offset
            .checked_add(raw.len().into_u64())
            // write_raw would error if we were to overflow u64.
@@ -713,7 +726,7 @@ impl InMemoryLayer {

    pub(crate) async fn tick(&self) -> Option<u64> {
        let mut inner = self.inner.write().await;
-        let size = inner.file.len();
+        let size = inner.file.read().await.len();
        inner.resource_units.publish_size(size)
    }

@@ -809,7 +822,7 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
+                let file_contents: Vec<u8> = inner.file.read().await.load_to_vec(ctx).await?;

                let file_contents = Bytes::from(file_contents);

--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -107,6 +107,8 @@ async fn smoke_test() {
            .expect("tenant harness writes the control file")
    };

+    let img_before = (img_before.0, img_before.1.await.unwrap().unwrap());
+    let img_after = (img_after.0, img_after.1.await.unwrap().unwrap());
    assert_eq!(img_before, img_after);

    // evict_and_wait can timeout, but it doesn't cancel the evicting itself
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -121,11 +121,11 @@ impl SplitImageLayerWriter {
                self.generated_layers
                    .push(SplitWriterResult::Discarded(layer_key));
            } else {
-                let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?;
-
-                let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-                self.generated_layers
-                    .push(SplitWriterResult::Produced(layer));
+                self.generated_layers.push(SplitWriterResult::Produced(
+                    prev_image_writer
+                        .finish_with_end_key(tline, key, ctx)
+                        .await?,
+                ));
            }
        }
        self.inner.put_image(key, img, ctx).await
@@ -170,9 +170,9 @@ impl SplitImageLayerWriter {
        if discard(&layer_key).await {
            generated_layers.push(SplitWriterResult::Discarded(layer_key));
        } else {
-            let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?;
-            let layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
-            generated_layers.push(SplitWriterResult::Produced(layer));
+            generated_layers.push(SplitWriterResult::Produced(
+                inner.finish_with_end_key(tline, end_key, ctx).await?,
+            ));
        }
        Ok(generated_layers)
    }
@@ -188,7 +188,7 @@ impl SplitImageLayerWriter {
            .await
    }

-    /// This function will be deprecated with #8841.
+    /// When split writer fails, the caller should call this function and handle partially generated layers.
    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
        Ok((self.generated_layers, self.inner))
    }
@@ -204,7 +204,7 @@ impl SplitImageLayerWriter {
 /// will split them into multiple files based on size.
 #[must_use]
 pub struct SplitDeltaLayerWriter {
-    inner: Option<(Key, DeltaLayerWriter)>,
+    inner: DeltaLayerWriter,
    target_layer_size: u64,
    generated_layers: Vec<SplitWriterResult>,
    conf: &'static PageServerConf,
@@ -212,6 +212,7 @@ pub struct SplitDeltaLayerWriter {
    tenant_shard_id: TenantShardId,
    lsn_range: Range<Lsn>,
    last_key_written: Key,
+    start_key: Key,
 }

 impl SplitDeltaLayerWriter {
@@ -219,18 +220,29 @@ impl SplitDeltaLayerWriter {
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
+        start_key: Key,
        lsn_range: Range<Lsn>,
        target_layer_size: u64,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
-            inner: None,
+            inner: DeltaLayerWriter::new(
+                conf,
+                timeline_id,
+                tenant_shard_id,
+                start_key,
+                lsn_range.clone(),
+                ctx,
+            )
+            .await?,
            generated_layers: Vec::new(),
            conf,
            timeline_id,
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
+            start_key,
        })
    }

@@ -253,26 +265,9 @@ impl SplitDeltaLayerWriter {
        //
        // Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
        // strategy. https://github.com/neondatabase/neon/issues/8837
-
-        if self.inner.is_none() {
-            self.inner = Some((
-                key,
-                DeltaLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    key,
-                    self.lsn_range.clone(),
-                    ctx,
-                )
-                .await?,
-            ));
-        }
-        let (_, inner) = self.inner.as_mut().unwrap();
-
        let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
-        if inner.num_keys() >= 1
-            && inner.estimated_size() + addition_size_estimation >= self.target_layer_size
+        if self.inner.num_keys() >= 1
+            && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
        {
            if key != self.last_key_written {
                let next_delta_writer = DeltaLayerWriter::new(
@@ -284,13 +279,13 @@ impl SplitDeltaLayerWriter {
                    ctx,
                )
                .await?;
-                let (start_key, prev_delta_writer) =
-                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
+                let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
                let layer_key = PersistentLayerKey {
-                    key_range: start_key..key,
+                    key_range: self.start_key..key,
                    lsn_range: self.lsn_range.clone(),
                    is_delta: true,
                };
+                self.start_key = key;
                if discard(&layer_key).await {
                    drop(prev_delta_writer);
                    self.generated_layers
@@ -301,18 +296,17 @@ impl SplitDeltaLayerWriter {
                    self.generated_layers
                        .push(SplitWriterResult::Produced(delta_layer));
                }
-            } else if inner.estimated_size() >= S3_UPLOAD_LIMIT {
+            } else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
                // We have to produce a very large file b/c a key is updated too often.
                anyhow::bail!(
                    "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
                    key,
-                    inner.estimated_size()
+                    self.inner.estimated_size()
                );
            }
        }
        self.last_key_written = key;
-        let (_, inner) = self.inner.as_mut().unwrap();
-        inner.put_value(key, lsn, val, ctx).await
+        self.inner.put_value(key, lsn, val, ctx).await
    }

    pub async fn put_value(
@@ -331,6 +325,7 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
+        end_key: Key,
        discard: D,
    ) -> anyhow::Result<Vec<SplitWriterResult>>
    where
@@ -342,15 +337,11 @@ impl SplitDeltaLayerWriter {
            inner,
            ..
        } = self;
-        let Some((start_key, inner)) = inner else {
-            return Ok(generated_layers);
-        };
        if inner.num_keys() == 0 {
            return Ok(generated_layers);
        }
-        let end_key = self.last_key_written.next();
        let layer_key = PersistentLayerKey {
-            key_range: start_key..end_key,
+            key_range: self.start_key..end_key,
            lsn_range: self.lsn_range.clone(),
            is_delta: true,
        };
@@ -369,14 +360,15 @@ impl SplitDeltaLayerWriter {
        self,
        tline: &Arc<Timeline>,
        ctx: &RequestContext,
+        end_key: Key,
    ) -> anyhow::Result<Vec<SplitWriterResult>> {
-        self.finish_with_discard_fn(tline, ctx, |_| async { false })
+        self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
            .await
    }

-    /// This function will be deprecated with #8841.
-    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, Option<DeltaLayerWriter>)> {
-        Ok((self.generated_layers, self.inner.map(|x| x.1)))
+    /// When split writer fails, the caller should call this function and handle partially generated layers.
+    pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
+        Ok((self.generated_layers, self.inner))
    }
 }

@@ -440,8 +432,10 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
+            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &ctx,
        )
        .await
        .unwrap();
@@ -466,22 +460,11 @@ mod tests {
            )
            .await
            .unwrap();
-        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
+        let layers = delta_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
        assert_eq!(layers.len(), 1);
-        assert_eq!(
-            layers
-                .into_iter()
-                .next()
-                .unwrap()
-                .into_resident_layer()
-                .layer_desc()
-                .key(),
-            PersistentLayerKey {
-                key_range: get_key(0)..get_key(1),
-                lsn_range: Lsn(0x18)..Lsn(0x20),
-                is_delta: true
-            }
-        );
    }

    #[tokio::test]
@@ -518,8 +501,10 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
+            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &ctx,
        )
        .await
        .unwrap();
@@ -548,7 +533,10 @@ mod tests {
            .finish(&tline, &ctx, get_key(N as u32))
            .await
            .unwrap();
-        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
+        let delta_layers = delta_writer
+            .finish(&tline, &ctx, get_key(N as u32))
+            .await
+            .unwrap();
        if discard {
            for layer in image_layers {
                layer.into_discarded_layer();
@@ -567,14 +555,6 @@ mod tests {
                .collect_vec();
            assert_eq!(image_layers.len(), N / 512 + 1);
            assert_eq!(delta_layers.len(), N / 512 + 1);
-            assert_eq!(
-                delta_layers.first().unwrap().layer_desc().key_range.start,
-                get_key(0)
-            );
-            assert_eq!(
-                delta_layers.last().unwrap().layer_desc().key_range.end,
-                get_key(N as u32)
-            );
            for idx in 0..image_layers.len() {
                assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
                assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
@@ -622,8 +602,10 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
+            get_key(0),
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
+            &ctx,
        )
        .await
        .unwrap();
@@ -662,35 +644,11 @@ mod tests {
            )
            .await
            .unwrap();
-        let layers = delta_writer.finish(&tline, &ctx).await.unwrap();
+        let layers = delta_writer
+            .finish(&tline, &ctx, get_key(10))
+            .await
+            .unwrap();
        assert_eq!(layers.len(), 2);
-        let mut layers_iter = layers.into_iter();
-        assert_eq!(
-            layers_iter
-                .next()
-                .unwrap()
-                .into_resident_layer()
-                .layer_desc()
-                .key(),
-            PersistentLayerKey {
-                key_range: get_key(0)..get_key(1),
-                lsn_range: Lsn(0x18)..Lsn(0x20),
-                is_delta: true
-            }
-        );
-        assert_eq!(
-            layers_iter
-                .next()
-                .unwrap()
-                .into_resident_layer()
-                .layer_desc()
-                .key(),
-            PersistentLayerKey {
-                key_range: get_key(1)..get_key(2),
-                lsn_range: Lsn(0x18)..Lsn(0x20),
-                is_delta: true
-            }
-        );
    }

    #[tokio::test]
@@ -710,8 +668,10 @@ mod tests {
            tenant.conf,
            tline.timeline_id,
            tenant.tenant_shard_id,
+            get_key(0),
            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
            4 * 1024 * 1024,
+            &ctx,
        )
        .await
        .unwrap();
@@ -729,20 +689,10 @@ mod tests {
                .await
                .unwrap();
        }
-        let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap();
+        let delta_layers = delta_writer
+            .finish(&tline, &ctx, get_key(N as u32))
+            .await
+            .unwrap();
        assert_eq!(delta_layers.len(), 1);
-        let delta_layer = delta_layers
-            .into_iter()
-            .next()
-            .unwrap()
-            .into_resident_layer();
-        assert_eq!(
-            delta_layer.layer_desc().key(),
-            PersistentLayerKey {
-                key_range: get_key(0)..get_key(1),
-                lsn_range: Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
-                is_delta: true
-            }
-        );
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,6 +18,7 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::{stream::FuturesUnordered, StreamExt};
 use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
@@ -68,7 +69,9 @@ use crate::{
    tenant::{
        layer_map::{LayerMap, SearchResult},
        metadata::TimelineMetadata,
-        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
+        storage_layer::{
+            convert, inmemory_layer::IndexEntry, PersistentLayerDesc, ValueReconstructSituation,
+        },
    },
    walredo,
 };
@@ -1129,22 +1132,38 @@ impl Timeline {
        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
            .for_get_kind(get_kind)
            .start_timer();
-        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
        let layers_visited = reconstruct_state.get_layers_visited();

+        let futs = FuturesUnordered::new();
        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
-            match res {
-                Err(err) => {
-                    results.insert(key, Err(err));
-                }
-                Ok(state) => {
-                    let state = ValueReconstructState::from(state);
+            futs.push({
+                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                async move {
+                    let state = res.expect("Read path is infallible");
+                    assert!(matches!(
+                        state.situation,
+                        ValueReconstructSituation::Complete
+                    ));

-                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
-                    results.insert(key, reconstruct_res);
+                    let converted = match convert(key, state).await {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return (key, Err(err));
+                        }
+                    };
+
+                    (
+                        key,
+                        walredo_self.reconstruct_value(key, lsn, converted).await,
+                    )
                }
-            }
+            });
        }
+
+        let results = futs
+            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .await;
+
        reconstruct_timer.stop_and_record();

        // For aux file keys (v1 or v2) the vectored read path does not return an error
@@ -4013,8 +4032,7 @@ impl Timeline {
        if wrote_keys {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
-            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -4102,8 +4120,7 @@ impl Timeline {
        if wrote_any_image {
            // Normal path: we have written some data into the new image layer for this
            // partition, so flush it to disk.
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
-            let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
            Ok(ImageLayerCreationOutcome {
                image: Some(image_layer),
                next_start_key: img_range.end,
@@ -5405,8 +5422,7 @@ impl Timeline {
        for (key, img) in images {
            image_layer_writer.put_image(key, img, ctx).await?;
        }
-        let (desc, path) = image_layer_writer.finish(ctx).await?;
-        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+        let image_layer = image_layer_writer.finish(self, ctx).await?;

        {
            let mut guard = self.layers.write().await;
@@ -5499,30 +5515,30 @@ impl Timeline {
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
        self: &Arc<Timeline>,
-        lsn: Lsn,
-        ctx: &RequestContext,
+        _lsn: Lsn,
+        _ctx: &RequestContext,
    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
-        let mut all_data = Vec::new();
-        let guard = self.layers.read().await;
-        for layer in guard.layer_map()?.iter_historic_layers() {
-            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
-                let layer = guard.get_from_desc(&layer);
-                let mut reconstruct_data = ValuesReconstructState::default();
-                layer
-                    .get_values_reconstruct_data(
-                        KeySpace::single(Key::MIN..Key::MAX),
-                        lsn..Lsn(lsn.0 + 1),
-                        &mut reconstruct_data,
-                        ctx,
-                    )
-                    .await?;
-                for (k, v) in reconstruct_data.keys {
-                    all_data.push((k, v?.img.unwrap().1));
-                }
-            }
-        }
-        all_data.sort();
-        Ok(all_data)
+        // let mut all_data = Vec::new();
+        // let guard = self.layers.read().await;
+        // for layer in guard.layer_map()?.iter_historic_layers() {
+        //     if !layer.is_delta() && layer.image_layer_lsn() == lsn {
+        //         let layer = guard.get_from_desc(&layer);
+        //         let mut reconstruct_data = ValuesReconstructState::default();
+        //         layer
+        //             .get_values_reconstruct_data(
+        //                 KeySpace::single(Key::MIN..Key::MAX),
+        //                 lsn..Lsn(lsn.0 + 1),
+        //                 &mut reconstruct_data,
+        //                 ctx,
+        //             )
+        //             .await?;
+        //         for (k, v) in reconstruct_data.keys {
+        //             all_data.push((k, v?.img.unwrap().1));
+        //         }
+        //     }
+        // }
+        // all_data.sort();
+        Ok(Vec::new())
    }

    /// Get all historic layer descriptors in the layer map
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -563,12 +563,10 @@ impl Timeline {
                .await?;

            if keys_written > 0 {
-                let (desc, path) = image_layer_writer
-                    .finish(ctx)
+                let new_layer = image_layer_writer
+                    .finish(self, ctx)
                    .await
                    .map_err(CompactionError::Other)?;
-                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
-                    .map_err(CompactionError::Other)?;
                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);
@@ -1811,6 +1809,7 @@ impl Timeline {
            .unwrap();
        // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
        // as an L0 layer.
+        let hack_end_key = Key::NON_L0_MAX;
        let mut delta_layers = Vec::new();
        let mut image_layers = Vec::new();
        let mut downloaded_layers = Vec::new();
@@ -1856,8 +1855,10 @@ impl Timeline {
            self.conf,
            self.timeline_id,
            self.tenant_shard_id,
+            Key::MIN,
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
+            ctx,
        )
        .await?;

@@ -1964,7 +1965,7 @@ impl Timeline {
        let produced_image_layers = if let Some(writer) = image_layer_writer {
            if !dry_run {
                writer
-                    .finish_with_discard_fn(self, ctx, Key::MAX, discard)
+                    .finish_with_discard_fn(self, ctx, hack_end_key, discard)
                    .await?
            } else {
                let (layers, _) = writer.take()?;
@@ -1977,7 +1978,7 @@ impl Timeline {

        let produced_delta_layers = if !dry_run {
            delta_layer_writer
-                .finish_with_discard_fn(self, ctx, discard)
+                .finish_with_discard_fn(self, ctx, hack_end_key, discard)
                .await?
        } else {
            let (layers, _) = delta_layer_writer.take()?;
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -33,6 +33,7 @@ use crate::virtual_file::{self, VirtualFile};
 pub struct BlobMeta {
    pub key: Key,
    pub lsn: Lsn,
+    pub will_init: bool,
 }

 /// Blob offsets into [`VectoredBlobsBuf::buf`]
@@ -355,7 +356,8 @@ pub enum BlobFlag {
 /// * Iterate over the collected blobs and coalesce them into reads at the end
 pub struct VectoredReadPlanner {
    // Track all the blob offsets. Start offsets must be ordered.
-    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Note: last bool is will_init
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64, bool)>>,
    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
    prev: Option<(Key, Lsn, u64, BlobFlag)>,

@@ -420,12 +422,12 @@ impl VectoredReadPlanner {
        match flag {
            BlobFlag::None => {
                let blobs_for_key = self.blobs.entry(key).or_default();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, false));
            }
            BlobFlag::ReplaceAll => {
                let blobs_for_key = self.blobs.entry(key).or_default();
                blobs_for_key.clear();
-                blobs_for_key.push((lsn, start_offset, end_offset));
+                blobs_for_key.push((lsn, start_offset, end_offset, true));
            }
            BlobFlag::Ignore => {}
        }
@@ -436,11 +438,17 @@ impl VectoredReadPlanner {
        let mut reads = Vec::new();

        for (key, blobs_for_key) in self.blobs {
-            for (lsn, start_offset, end_offset) in blobs_for_key {
+            for (lsn, start_offset, end_offset, will_init) in blobs_for_key {
                let extended = match &mut current_read_builder {
-                    Some(read_builder) => {
-                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
-                    }
+                    Some(read_builder) => read_builder.extend(
+                        start_offset,
+                        end_offset,
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
+                    ),
                    None => VectoredReadExtended::No,
                };

@@ -448,7 +456,11 @@ impl VectoredReadPlanner {
                    let next_read_builder = VectoredReadBuilder::new(
                        start_offset,
                        end_offset,
-                        BlobMeta { key, lsn },
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init,
+                        },
                        self.max_read_size,
                        self.mode,
                    );
@@ -665,10 +677,19 @@ impl StreamingVectoredReadPlanner {
        start_offset: u64,
        end_offset: u64,
        is_last_blob_in_read: bool,
+        // destination: oneshot::Sender<Result<Bytes, std::io::Error>>,
    ) -> Option<VectoredRead> {
        match &mut self.read_builder {
            Some(read_builder) => {
-                let extended = read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn });
+                let extended = read_builder.extend(
+                    start_offset,
+                    end_offset,
+                    BlobMeta {
+                        key,
+                        lsn,
+                        will_init: false,
+                    },
+                );
                assert_eq!(extended, VectoredReadExtended::Yes);
            }
            None => {
@@ -676,7 +697,11 @@ impl StreamingVectoredReadPlanner {
                    Some(VectoredReadBuilder::new_streaming(
                        start_offset,
                        end_offset,
-                        BlobMeta { key, lsn },
+                        BlobMeta {
+                            key,
+                            lsn,
+                            will_init: false,
+                        },
                        self.mode,
                    ))
                };
@@ -1008,6 +1033,7 @@ mod tests {
        let meta = BlobMeta {
            key: Key::MIN,
            lsn: Lsn(0),
+            will_init: false,
        };

        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,9 @@ use std::time::Duration;
 use std::time::SystemTime;

 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
+use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
+use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
+use postgres_ffi::TimestampTz;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

 use anyhow::{bail, Context, Result};
@@ -46,31 +48,16 @@ use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
+use postgres_ffi::v14::xlog_utils::*;
+use postgres_ffi::v14::CheckPoint;
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
-use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

-enum_pgversion! {CheckPoint, pgv::CheckPoint}
-
-impl CheckPoint {
-    fn encode(&self) -> Result<Bytes, SerializeError> {
-        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.encode() })
-    }
-
-    fn update_next_xid(&mut self, xid: u32) -> bool {
-        enum_pgversion_dispatch!(self, CheckPoint, cp, { cp.update_next_xid(xid) })
-    }
-
-    pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool {
-        enum_pgversion_dispatch!(self, CheckPoint, cp, {
-            cp.update_next_multixid(multi_xid, multi_offset)
-        })
-    }
-}
-
 pub struct WalIngest {
    shard: ShardIdentity,
+    pg_version: u32,
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
    warn_ingest_lag: WarnIngestLag,
@@ -91,16 +78,12 @@ impl WalIngest {
        // Fetch the latest checkpoint into memory, so that we can compare with it
        // quickly in `ingest_record` and update it when it changes.
        let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
-        let pgversion = timeline.pg_version;
-
-        let checkpoint = dispatch_pgversion!(pgversion, {
-            let checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
-            trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
-            <pgv::CheckPoint as Into<CheckPoint>>::into(checkpoint)
-        });
+        let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);

        Ok(WalIngest {
            shard: *timeline.get_shard_identity(),
+            pg_version: timeline.pg_version,
            checkpoint,
            checkpoint_modified: false,
            warn_ingest_lag: WarnIngestLag {
@@ -134,7 +117,7 @@ impl WalIngest {

        modification.set_lsn(lsn)?;

-        if decoded.is_dbase_create_copy(pg_version) {
+        if decoded.is_dbase_create_copy(self.pg_version) {
            // Records of this type should always be preceded by a commit(), as they
            // rely on reading data pages back from the Timeline.
            assert!(!modification.has_dirty_data_pages());
@@ -237,26 +220,6 @@ impl WalIngest {
                                .await?;
                        }
                    }
-                } else if pg_version == 17 {
-                    if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                        // The XLOG record was renamed between v14 and v15,
-                        // but the record format is the same.
-                        // So we can reuse XlCreateDatabase here.
-                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
-                        let createdb = XlCreateDatabase::decode(&mut buf);
-                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
-                            .await?;
-                    } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
-                        let dropdb = XlDropDatabase::decode(&mut buf);
-                        for tablespace_id in dropdb.tablespace_ids {
-                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                            modification
-                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
-                                .await?;
-                        }
-                    }
                }
            }
            pg_constants::RM_TBLSPC_ID => {
@@ -266,11 +229,7 @@ impl WalIngest {
                let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;

                if info == pg_constants::CLOG_ZEROPAGE {
-                    let pageno = if pg_version < 17 {
-                        buf.get_u32_le()
-                    } else {
-                        buf.get_u64_le() as u32
-                    };
+                    let pageno = buf.get_u32_le();
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -284,7 +243,7 @@ impl WalIngest {
                    .await?;
                } else {
                    assert!(info == pg_constants::CLOG_TRUNCATE);
-                    let xlrec = XlClogTruncate::decode(&mut buf, pg_version);
+                    let xlrec = XlClogTruncate::decode(&mut buf);
                    self.ingest_clog_truncate_record(modification, &xlrec, ctx)
                        .await?;
                }
@@ -323,21 +282,12 @@ impl WalIngest {
                        parsed_xact.xid,
                        lsn,
                    );
-
-                    let xid: u64 = if pg_version >= 17 {
-                        self.adjust_to_full_transaction_id(parsed_xact.xid)?
-                    } else {
-                        parsed_xact.xid as u64
-                    };
-                    modification.drop_twophase_file(xid, ctx).await?;
-                } else if info == pg_constants::XLOG_XACT_PREPARE {
-                    let xid: u64 = if pg_version >= 17 {
-                        self.adjust_to_full_transaction_id(decoded.xl_xid)?
-                    } else {
-                        decoded.xl_xid as u64
-                    };
                    modification
-                        .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx)
+                        .drop_twophase_file(parsed_xact.xid, ctx)
+                        .await?;
+                } else if info == pg_constants::XLOG_XACT_PREPARE {
+                    modification
+                        .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
                        .await?;
                }
            }
@@ -345,11 +295,7 @@ impl WalIngest {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

                if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
-                    let pageno = if pg_version < 17 {
-                        buf.get_u32_le()
-                    } else {
-                        buf.get_u64_le() as u32
-                    };
+                    let pageno = buf.get_u32_le();
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -362,11 +308,7 @@ impl WalIngest {
                    )
                    .await?;
                } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
-                    let pageno = if pg_version < 17 {
-                        buf.get_u32_le()
-                    } else {
-                        buf.get_u64_le() as u32
-                    };
+                    let pageno = buf.get_u32_le();
                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    self.put_slru_page_image(
@@ -395,93 +337,70 @@ impl WalIngest {
            pg_constants::RM_XLOG_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

-                if info == pg_constants::XLOG_PARAMETER_CHANGE {
-                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
-                        let rec = v17::XlParameterChange::decode(&mut buf);
-                        cp.wal_level = rec.wal_level;
+                if info == pg_constants::XLOG_NEXTOID {
+                    let next_oid = buf.get_u32_le();
+                    if self.checkpoint.nextOid != next_oid {
+                        self.checkpoint.nextOid = next_oid;
                        self.checkpoint_modified = true;
                    }
-                } else if info == pg_constants::XLOG_END_OF_RECOVERY {
-                    if let CheckPoint::V17(cp) = &mut self.checkpoint {
-                        let rec = v17::XlEndOfRecovery::decode(&mut buf);
-                        cp.wal_level = rec.wal_level;
-                        self.checkpoint_modified = true;
-                    }
-                }
-
-                enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
-                    if info == pg_constants::XLOG_NEXTOID {
-                        let next_oid = buf.get_u32_le();
-                        if cp.nextOid != next_oid {
-                            cp.nextOid = next_oid;
-                            self.checkpoint_modified = true;
-                        }
-                    } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
-                        || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
+                    || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                {
+                    let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
+                    buf.copy_to_slice(&mut checkpoint_bytes);
+                    let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+                    trace!(
+                        "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
+                        xlog_checkpoint.oldestXid,
+                        self.checkpoint.oldestXid
+                    );
+                    if (self
+                        .checkpoint
+                        .oldestXid
+                        .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
+                        < 0
                    {
-                        let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT];
-                        buf.copy_to_slice(&mut checkpoint_bytes);
-                        let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?;
-                        trace!(
-                            "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
-                            xlog_checkpoint.oldestXid,
-                            cp.oldestXid
-                        );
-                        if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
-                            cp.oldestXid = xlog_checkpoint.oldestXid;
-                        }
-                        trace!(
-                            "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
-                            xlog_checkpoint.oldestActiveXid,
-                            cp.oldestActiveXid
-                        );
-
-                        // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
-                        // because at shutdown, all in-progress transactions will implicitly
-                        // end. Postgres startup code knows that, and allows hot standby to start
-                        // immediately from a shutdown checkpoint.
-                        //
-                        // In Neon, Postgres hot standby startup always behaves as if starting from
-                        // an online checkpoint. It needs a valid `oldestActiveXid` value, so
-                        // instead of overwriting self.checkpoint.oldestActiveXid with
-                        // InvalidTransactionid from the checkpoint WAL record, update it to a
-                        // proper value, knowing that there are no in-progress transactions at this
-                        // point, except for prepared transactions.
-                        //
-                        // See also the neon code changes in the InitWalRecovery() function.
-                        if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
-                            && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-                        {
-                            let oldest_active_xid = if pg_version >= 17 {
-                                let mut oldest_active_full_xid = cp.nextXid.value;
-                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                                    if xid < oldest_active_full_xid {
-                                        oldest_active_full_xid = xid;
-                                    }
-                                }
-                                oldest_active_full_xid as u32
-                            } else {
-                                let mut oldest_active_xid = cp.nextXid.value as u32;
-                                for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
-                                    let narrow_xid = xid as u32;
-                                    if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
-                                        oldest_active_xid = narrow_xid;
-                                    }
-                                }
-                                oldest_active_xid
-                            };
-                            cp.oldestActiveXid = oldest_active_xid;
-                        } else {
-                            cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
-                        }
-
-                        // Write a new checkpoint key-value pair on every checkpoint record, even
-                        // if nothing really changed. Not strictly required, but it seems nice to
-                        // have some trace of the checkpoint records in the layer files at the same
-                        // LSNs.
-                        self.checkpoint_modified = true;
+                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
                    }
-                });
+                    trace!(
+                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                        xlog_checkpoint.oldestActiveXid,
+                        self.checkpoint.oldestActiveXid
+                    );
+
+                    // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`,
+                    // because at shutdown, all in-progress transactions will implicitly
+                    // end. Postgres startup code knows that, and allows hot standby to start
+                    // immediately from a shutdown checkpoint.
+                    //
+                    // In Neon, Postgres hot standby startup always behaves as if starting from
+                    // an online checkpoint. It needs a valid `oldestActiveXid` value, so
+                    // instead of overwriting self.checkpoint.oldestActiveXid with
+                    // InvalidTransactionid from the checkpoint WAL record, update it to a
+                    // proper value, knowing that there are no in-progress transactions at this
+                    // point, except for prepared transactions.
+                    //
+                    // See also the neon code changes in the InitWalRecovery() function.
+                    if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
+                        && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                    {
+                        let mut oldest_active_xid = self.checkpoint.nextXid.value as u32;
+                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
+                            if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 {
+                                oldest_active_xid = xid;
+                            }
+                        }
+                        self.checkpoint.oldestActiveXid = oldest_active_xid;
+                    } else {
+                        self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
+                    }
+
+                    // Write a new checkpoint key-value pair on every checkpoint record, even
+                    // if nothing really changed. Not strictly required, but it seems nice to
+                    // have some trace of the checkpoint records in the layer files at the same
+                    // LSNs.
+                    self.checkpoint_modified = true;
+                }
            }
            pg_constants::RM_LOGICALMSG_ID => {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -505,11 +424,7 @@ impl WalIngest {
                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
                if info == pg_constants::XLOG_RUNNING_XACTS {
                    let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
-
-                    enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
-                        cp.oldestActiveXid = xlrec.oldest_running_xid;
-                    });
-
+                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
                    self.checkpoint_modified = true;
                }
            }
@@ -582,25 +497,6 @@ impl WalIngest {
        Ok(modification.len() > prev_len)
    }

-    /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
-        let next_full_xid =
-            enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
-
-        let next_xid = (next_full_xid) as u32;
-        let mut epoch = (next_full_xid >> 32) as u32;
-
-        if xid > next_xid {
-            // Wraparound occurred, must be from a prev epoch.
-            if epoch == 0 {
-                bail!("apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}");
-            }
-            epoch -= 1;
-        }
-
-        Ok((epoch as u64) << 32 | xid as u64)
-    }
-
    /// Do not store this block, but observe it for the purposes of updating our relation size state.
    async fn observe_decoded_block(
        &mut self,
@@ -643,7 +539,7 @@ impl WalIngest {
            && blk.has_image
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
            && (decoded.xl_info == pg_constants::XLOG_FPI
-            || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
+                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
            // compression of WAL is not yet supported: fall back to storing the original WAL record
            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)
            // do not materialize null pages because them most likely be soon replaced with real data
@@ -901,73 +797,6 @@ impl WalIngest {
                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            17 => {
-                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
-                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
-
-                    if info == pg_constants::XLOG_HEAP_INSERT {
-                        let xlrec = v17::XlHeapInsert::decode(buf);
-                        assert_eq!(0, buf.remaining());
-                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
-                            new_heap_blkno = Some(decoded.blocks[0].blkno);
-                        }
-                    } else if info == pg_constants::XLOG_HEAP_DELETE {
-                        let xlrec = v17::XlHeapDelete::decode(buf);
-                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
-                            new_heap_blkno = Some(decoded.blocks[0].blkno);
-                        }
-                    } else if info == pg_constants::XLOG_HEAP_UPDATE
-                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
-                    {
-                        let xlrec = v17::XlHeapUpdate::decode(buf);
-                        // the size of tuple data is inferred from the size of the record.
-                        // we can't validate the remaining number of bytes without parsing
-                        // the tuple data.
-                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
-                        }
-                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
-                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
-                            // non-HOT update where the new tuple goes to different page than
-                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
-                            // set.
-                            new_heap_blkno = Some(decoded.blocks[0].blkno);
-                        }
-                    } else if info == pg_constants::XLOG_HEAP_LOCK {
-                        let xlrec = v17::XlHeapLock::decode(buf);
-                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
-                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
-                        }
-                    }
-                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
-                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
-                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
-                        let xlrec = v17::XlHeapMultiInsert::decode(buf);
-
-                        let offset_array_len =
-                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
-                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
-                                0
-                            } else {
-                                size_of::<u16>() * xlrec.ntuples as usize
-                            };
-                        assert_eq!(offset_array_len, buf.remaining());
-
-                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
-                            new_heap_blkno = Some(decoded.blocks[0].blkno);
-                        }
-                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
-                        let xlrec = v17::XlHeapLockUpdated::decode(buf);
-                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
-                            old_heap_blkno = Some(decoded.blocks[0].blkno);
-                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
-                        }
-                    }
-                } else {
-                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
-                }
-            }
            _ => {}
        }

@@ -1076,26 +905,26 @@ impl WalIngest {
        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

        match pg_version {
-            16 | 17 => {
+            16 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

                match info {
                    pg_constants::XLOG_NEON_HEAP_INSERT => {
-                        let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
+                        let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf);
                        assert_eq!(0, buf.remaining());
                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_DELETE => {
-                        let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
+                        let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
                            new_heap_blkno = Some(decoded.blocks[0].blkno);
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_UPDATE
                    | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
-                        let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
+                        let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf);
                        // the size of tuple data is inferred from the size of the record.
                        // we can't validate the remaining number of bytes without parsing
                        // the tuple data.
@@ -1111,7 +940,7 @@ impl WalIngest {
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
-                        let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);
+                        let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf);

                        let offset_array_len =
                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
@@ -1127,7 +956,7 @@ impl WalIngest {
                        }
                    }
                    pg_constants::XLOG_NEON_HEAP_LOCK => {
-                        let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
+                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
                            old_heap_blkno = Some(decoded.blocks[0].blkno);
                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
@@ -1375,7 +1204,7 @@ impl WalIngest {
            if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
                // Tail of last remaining FSM page has to be zeroed.
                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
+                modification.put_rel_page_image_zero(rel, fsm_physical_page_no);
                fsm_physical_page_no += 1;
            }
            let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1397,7 +1226,7 @@ impl WalIngest {
            if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
                // Tail of last remaining vm page has to be zeroed.
                // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
-                modification.put_rel_page_image_zero(rel, vm_page_no)?;
+                modification.put_rel_page_image_zero(rel, vm_page_no);
                vm_page_no += 1;
            }
            let nblocks = get_relsize(modification, rel, ctx).await?;
@@ -1413,17 +1242,12 @@ impl WalIngest {
    fn warn_on_ingest_lag(
        &mut self,
        conf: &crate::config::PageServerConf,
-        wal_timestamp: TimestampTz,
+        wal_timestmap: TimestampTz,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
        let now = SystemTime::now();
        let rate_limits = &mut self.warn_ingest_lag;
-
-        let ts = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, _cp, {
-            pgv::xlog_utils::try_from_pg_timestamp(wal_timestamp)
-        });
-
-        match ts {
+        match try_from_pg_timestamp(wal_timestmap) {
            Ok(ts) => {
                match now.duration_since(ts) {
                    Ok(lag) => {
@@ -1433,7 +1257,7 @@ impl WalIngest {
                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                            })
                        }
-                    }
+                    },
                    Err(e) => {
                        let delta_t = e.duration();
                        // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds)
@@ -1447,6 +1271,7 @@ impl WalIngest {
                        }
                    }
                };
+
            }
            Err(error) => {
                rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| {
@@ -1554,17 +1379,14 @@ impl WalIngest {
        // truncated, but a checkpoint record with the updated values isn't written until
        // later. In Neon, a server can start at any LSN, not just on a checkpoint record,
        // so we keep the oldestXid and oldestXidDB up-to-date.
-        enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
-            cp.oldestXid = xlrec.oldest_xid;
-            cp.oldestXidDB = xlrec.oldest_xid_db;
-        });
+        self.checkpoint.oldestXid = xlrec.oldest_xid;
+        self.checkpoint.oldestXidDB = xlrec.oldest_xid_db;
        self.checkpoint_modified = true;

        // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it

        let latest_page_number =
-            enum_pgversion_dispatch!(self.checkpoint, CheckPoint, cp, { cp.nextXid.value }) as u32
-                / pg_constants::CLOG_XACTS_PER_PAGE;
+            self.checkpoint.nextXid.value as u32 / pg_constants::CLOG_XACTS_PER_PAGE;

        // Now delete all segments containing pages between xlrec.pageno
        // and latest_page_number.
@@ -1572,9 +1394,7 @@ impl WalIngest {
        // First, make an important safety check:
        // the current endpoint page must not be eligible for removal.
        // See SimpleLruTruncate() in slru.c
-        if dispatch_pgversion!(modification.tline.pg_version, {
-            pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno)
-        }) {
+        if clogpage_precedes(latest_page_number, xlrec.pageno) {
            info!("could not truncate directory pg_xact apparent wraparound");
            return Ok(());
        }
@@ -1591,12 +1411,7 @@ impl WalIngest {
            .await?
        {
            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
-
-            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
-                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno)
-            });
-
-            if may_delete {
+            if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
                modification
                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
                    .await?;
@@ -1715,23 +1530,14 @@ impl WalIngest {
        xlrec: &XlMultiXactTruncate,
        ctx: &RequestContext,
    ) -> Result<()> {
-        let (maxsegment, startsegment, endsegment) =
-            enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
-                cp.oldestMulti = xlrec.end_trunc_off;
-                cp.oldestMultiDB = xlrec.oldest_multi_db;
-                let maxsegment: i32 = pgv::nonrelfile_utils::mx_offset_to_member_segment(
-                    pg_constants::MAX_MULTIXACT_OFFSET,
-                );
-                let startsegment: i32 =
-                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.start_trunc_memb);
-                let endsegment: i32 =
-                    pgv::nonrelfile_utils::mx_offset_to_member_segment(xlrec.end_trunc_memb);
-                (maxsegment, startsegment, endsegment)
-            });
-
+        self.checkpoint.oldestMulti = xlrec.end_trunc_off;
+        self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
        self.checkpoint_modified = true;

        // PerformMembersTruncation
+        let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
+        let startsegment: i32 = mx_offset_to_member_segment(xlrec.start_trunc_memb);
+        let endsegment: i32 = mx_offset_to_member_segment(xlrec.end_trunc_memb);
        let mut segment: i32 = startsegment;

        // Delete all the segments except the last one. The last segment can still
@@ -1890,7 +1696,7 @@ impl WalIngest {
                    continue;
                }

-                modification.put_rel_page_image_zero(rel, gap_blknum)?;
+                modification.put_rel_page_image_zero(rel, gap_blknum);
            }
        }
        Ok(())
@@ -1956,7 +1762,7 @@ impl WalIngest {

            // fill the gap with zeros
            for gap_blknum in old_nblocks..blknum {
-                modification.put_slru_page_image_zero(kind, segno, gap_blknum)?;
+                modification.put_slru_page_image_zero(kind, segno, gap_blknum);
            }
        }
        Ok(())
@@ -2005,23 +1811,11 @@ mod tests {
        // TODO
    }

-    #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
-        for i in 14..=16 {
-            dispatch_pgversion!(i, {
-                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
-            });
-        }
-
-        Ok(())
-    }
+    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
        let mut m = tline.begin_modification(Lsn(0x10));
-        m.put_checkpoint(dispatch_pgversion!(
-            tline.pg_version,
-            pgv::ZERO_CHECKPOINT.clone()
-        ))?;
+        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
        m.commit(ctx).await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -174,7 +174,6 @@ impl DecodedWALRecord {
                }
                15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
                _ => {
                    panic!("Unsupported postgres version {pg_version}")
                }
@@ -342,47 +341,16 @@ pub mod v14 {
            }
        }
    }
-
-    #[repr(C)]
-    #[derive(Debug)]
-    pub struct XlParameterChange {
-        pub max_connections: i32,
-        pub max_worker_processes: i32,
-        pub max_wal_senders: i32,
-        pub max_prepared_xacts: i32,
-        pub max_locks_per_xact: i32,
-        pub wal_level: i32,
-        pub wal_log_hints: bool,
-        pub track_commit_timestamp: bool,
-        pub _padding: [u8; 2],
-    }
-
-    impl XlParameterChange {
-        pub fn decode(buf: &mut Bytes) -> XlParameterChange {
-            XlParameterChange {
-                max_connections: buf.get_i32_le(),
-                max_worker_processes: buf.get_i32_le(),
-                max_wal_senders: buf.get_i32_le(),
-                max_prepared_xacts: buf.get_i32_le(),
-                max_locks_per_xact: buf.get_i32_le(),
-                wal_level: buf.get_i32_le(),
-                wal_log_hints: buf.get_u8() != 0,
-                track_commit_timestamp: buf.get_u8() != 0,
-                _padding: [buf.get_u8(), buf.get_u8()],
-            }
-        }
-    }
 }

 pub mod v15 {
    pub use super::v14::{
        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
-        XlParameterChange,
    };
 }

 pub mod v16 {
-    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange};
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
    use bytes::{Buf, Bytes};
    use postgres_ffi::{OffsetNumber, TransactionId};

@@ -561,37 +529,6 @@ pub mod v16 {
    }
 }

-pub mod v17 {
-    pub use super::v14::XlHeapLockUpdated;
-    use bytes::{Buf, Bytes};
-    pub use postgres_ffi::{TimeLineID, TimestampTz};
-
-    pub use super::v16::rm_neon;
-    pub use super::v16::{
-        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
-    };
-
-    #[repr(C)]
-    #[derive(Debug)]
-    pub struct XlEndOfRecovery {
-        pub end_time: TimestampTz,
-        pub this_time_line_id: TimeLineID,
-        pub prev_time_line_id: TimeLineID,
-        pub wal_level: i32,
-    }
-
-    impl XlEndOfRecovery {
-        pub fn decode(buf: &mut Bytes) -> XlEndOfRecovery {
-            XlEndOfRecovery {
-                end_time: buf.get_i64_le(),
-                this_time_line_id: buf.get_u32_le(),
-                prev_time_line_id: buf.get_u32_le(),
-                wal_level: buf.get_i32_le(),
-            }
-        }
-    }
-}
-
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrCreate {
@@ -809,13 +746,9 @@ pub struct XlClogTruncate {
 }

 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes) -> XlClogTruncate {
        XlClogTruncate {
-            pageno: if pg_version < 17 {
-                buf.get_u32_le()
-            } else {
-                buf.get_u64_le() as u32
-            },
+            pageno: buf.get_u32_le(),
            oldest_xid: buf.get_u32_le(),
            oldest_xid_db: buf.get_u32_le(),
        }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -35,7 +35,6 @@ use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
-use std::future::Future;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
@@ -297,97 +296,6 @@ impl PostgresRedoManager {
        }
    }

-    async fn do_with_walredo_process<
-        F: FnOnce(Arc<Process>) -> Fut,
-        Fut: Future<Output = Result<O, Error>>,
-        O,
-    >(
-        &self,
-        pg_version: u32,
-        closure: F,
-    ) -> Result<O, Error> {
-        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
-            Ok(guard) => match &*guard {
-                ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
-                ProcessOnceCell::ManagerShutDown => {
-                    return Err(Error::Cancelled);
-                }
-            },
-            Err(permit) => {
-                let start = Instant::now();
-                // acquire guard before spawning process, so that we don't spawn new processes
-                // if the gate is already closed.
-                let _launched_processes_guard = match self.launched_processes.enter() {
-                    Ok(guard) => guard,
-                    Err(GateError::GateClosed) => unreachable!(
-                        "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
-                    ),
-                };
-                let proc = Arc::new(Process {
-                    process: process::WalRedoProcess::launch(
-                        self.conf,
-                        self.tenant_shard_id,
-                        pg_version,
-                    )
-                    .context("launch walredo process")?,
-                    _launched_processes_guard,
-                });
-                let duration = start.elapsed();
-                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                info!(
-                    elapsed_ms = duration.as_millis(),
-                    pid = proc.id(),
-                    "launched walredo process"
-                );
-                self.redo_process
-                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
-                proc
-            }
-        };
-
-        // async closures are unstable, would support &Process
-        let result = closure(proc.clone()).await;
-
-        if result.is_err() {
-            // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
-            // Note that there may be other tasks concurrent with us that also hold `proc`.
-            // We have to deal with that here.
-            // Also read the doc comment on field `self.redo_process`.
-            //
-            // NB: there may still be other concurrent threads using `proc`.
-            // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
-            //
-            // NB: the drop impl blocks the dropping thread with a wait() system call for
-            // the child process. In some ways the blocking is actually good: if we
-            // deferred the waiting into the background / to tokio if we used `tokio::process`,
-            // it could happen that if walredo always fails immediately, we spawn processes faster
-            // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
-            // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
-            // This probably needs revisiting at some later point.
-            match self.redo_process.get() {
-                None => (),
-                Some(guard) => {
-                    match &*guard {
-                        ProcessOnceCell::ManagerShutDown => {}
-                        ProcessOnceCell::Spawned(guard_proc) => {
-                            if Arc::ptr_eq(&proc, guard_proc) {
-                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                                guard.take_and_deinit();
-                            } else {
-                                // Another task already spawned another redo process (further up in this method)
-                                // and put it into `redo_process`. Do nothing, our view of the world is behind.
-                            }
-                        }
-                    }
-                }
-            }
-            // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
-            drop(proc);
-        }
-
-        result
-    }
-
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
@@ -411,63 +319,130 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let base_img = &base_img;
-            let closure = |proc: Arc<Process>| async move {
-                let started_at = std::time::Instant::now();
-
-                // Relational WAL records are applied using wal-redo-postgres
-                let result = proc
-                    .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-                    .context("apply_wal_records");
-
-                let duration = started_at.elapsed();
-
-                let len = records.len();
-                let nbytes = records.iter().fold(0, |acumulator, record| {
-                    acumulator
-                        + match &record.1 {
-                            NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                            _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                        }
-                });
-
-                WAL_REDO_TIME.observe(duration.as_secs_f64());
-                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
-
-                debug!(
-                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                    len,
-                    nbytes,
-                    duration.as_micros(),
-                    lsn
-                );
-
-                if let Err(e) = result.as_ref() {
-                    error!(
-                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                        records.len(),
-                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                        nbytes,
-                        base_img_lsn,
-                        lsn,
-                        n_attempts,
-                        e,
+            let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
+                Ok(guard) => match &*guard {
+                    ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
+                    ProcessOnceCell::ManagerShutDown => {
+                        return Err(Error::Cancelled);
+                    }
+                },
+                Err(permit) => {
+                    let start = Instant::now();
+                    // acquire guard before spawning process, so that we don't spawn new processes
+                    // if the gate is already closed.
+                    let _launched_processes_guard = match self.launched_processes.enter() {
+                                Ok(guard) => guard,
+                                Err(GateError::GateClosed) => unreachable!(
+                                    "shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
+                                ),
+                            };
+                    let proc = Arc::new(Process {
+                        process: process::WalRedoProcess::launch(
+                            self.conf,
+                            self.tenant_shard_id,
+                            pg_version,
+                        )
+                        .context("launch walredo process")?,
+                        _launched_processes_guard,
+                    });
+                    let duration = start.elapsed();
+                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                    info!(
+                        duration_ms = duration.as_millis(),
+                        pid = proc.id(),
+                        "launched walredo process"
                    );
+                    self.redo_process
+                        .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
+                    proc
                }
-
-                result.map_err(Error::Other)
            };
-            let result = self.do_with_walredo_process(pg_version, closure).await;

-            if result.is_ok() && n_attempts != 0 {
+            let started_at = std::time::Instant::now();
+
+            // Relational WAL records are applied using wal-redo-postgres
+            let result = proc
+                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
+                .await
+                .context("apply_wal_records");
+
+            let duration = started_at.elapsed();
+
+            let len = records.len();
+            let nbytes = records.iter().fold(0, |acumulator, record| {
+                acumulator
+                    + match &record.1 {
+                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                    }
+            });
+
+            WAL_REDO_TIME.observe(duration.as_secs_f64());
+            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+
+            debug!(
+                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                len,
+                nbytes,
+                duration.as_micros(),
+                lsn
+            );
+
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if let Err(e) = result.as_ref() {
+                error!(
+                    "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                    records.len(),
+                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                    nbytes,
+                    base_img_lsn,
+                    lsn,
+                    n_attempts,
+                    e,
+                );
+                // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation.
+                // Note that there may be other tasks concurrent with us that also hold `proc`.
+                // We have to deal with that here.
+                // Also read the doc comment on field `self.redo_process`.
+                //
+                // NB: there may still be other concurrent threads using `proc`.
+                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
+                //
+                // NB: the drop impl blocks the dropping thread with a wait() system call for
+                // the child process. In some ways the blocking is actually good: if we
+                // deferred the waiting into the background / to tokio if we used `tokio::process`,
+                // it could happen that if walredo always fails immediately, we spawn processes faster
+                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
+                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
+                // This probably needs revisiting at some later point.
+                match self.redo_process.get() {
+                    None => (),
+                    Some(guard) => {
+                        match &*guard {
+                            ProcessOnceCell::ManagerShutDown => {}
+                            ProcessOnceCell::Spawned(guard_proc) => {
+                                if Arc::ptr_eq(&proc, guard_proc) {
+                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                    guard.take_and_deinit();
+                                } else {
+                                    // Another task already spawned another redo process (further up in this method)
+                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                                }
+                            }
+                        }
+                    }
+                }
+                // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall.
+                drop(proc);
+            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
            n_attempts += 1;
            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
-                return result;
+                return result.map_err(Error::Other);
            }
        }
    }
--- a/pgxn/neon/bitmap.h
+++ b/pgxn/neon/bitmap.h
@@ -1,12 +0,0 @@
-#ifndef NEON_BITMAP_H
-#define NEON_BITMAP_H
-
-/*
- * Utilities for manipulating bits8* as bitmaps.
- */
-
-#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
-#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
-#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
-
-#endif //NEON_BITMAP_H
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -27,7 +27,6 @@
 #include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
-#include "port/pg_iovec.h"
 #include "postmaster/bgworker.h"
 #include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
@@ -41,7 +40,6 @@
 #include "utils/guc.h"

 #include "hll.h"
-#include "bitmap.h"

 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

@@ -471,99 +469,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	return found;
 }

-/*
- * Check if page is present in the cache.
- * Returns true if page is found in local cache.
- */
-int
-lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-					int nblocks, bits8 *bitmap)
-{
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	uint32		chunk_offs;
-	int			found = 0;
-	uint32		hash;
-	int			i = 0;
-
-	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
-		return 0;
-
-	CopyNRelFileInfoToBufTag(tag, rinfo);
-	tag.forkNum = forkNum;
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
-
-	tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
-	hash = get_hash_value(lfc_hash, &tag);
-	chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
-
-	LWLockAcquire(lfc_lock, LW_SHARED);
-
-	while (true)
-	{
-		int		this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
-		if (LFC_ENABLED())
-		{
-			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
-
-			if (entry != NULL)
-			{
-				for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
-				{
-					if ((entry->bitmap[chunk_offs >> 5] & 
-						(1 << (chunk_offs & 31))) != 0)
-					{
-						BITMAP_SET(bitmap, i);
-						found++;
-					}
-				}
-			}
-			else
-			{
-				i += this_chunk;
-			}
-		}
-		else
-		{
-			return found;
-		}
-
-		/*
-		 * Break out of the iteration before doing expensive stuff for
-		 * a next iteration
-		 */
-		if (i + 1 >= nblocks)
-			break;
-
-		/*
-		 * Prepare for the next iteration. We don't unlock here, as that'd
-		 * probably be more expensive than the gains it'd get us.
-		 */
-		tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
-		hash = get_hash_value(lfc_hash, &tag);
-		chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
-	}
-
-	LWLockRelease(lfc_lock);
-
-#if USE_ASSERT_CHECKING
-	do {
-		int count = 0;
-
-		for (int j = 0; j < nblocks; j++)
-		{
-			if (BITMAP_ISSET(bitmap, j))
-				count++;
-		}
-
-		Assert(count == found);
-	} while (false);
-#endif
-
-	return found;
-}
-
 /*
 * Evict a page (if present) from the local file cache
 */
@@ -643,171 +548,91 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 }

 /*
- * Try to read pages from local cache.
- * Returns the number of pages read from the local cache, and sets bits in
- * 'read' for the pages which were read. This may scribble over buffers not
- * marked in 'read', so be careful with operation ordering.
- *
- * In case of error local file cache is disabled (lfc->limit is set to zero),
- * and -1 is returned. Note that 'read' and the buffers may be touched and in
- * an otherwise invalid state.
- *
- * If the mask argument is supplied, bits will be set at the offsets of pages
- * that were present and read from the LFC.
+ * Try to read page from local cache.
+ * Returns true if page is found in local cache.
+ * In case of error local file cache is disabled (lfc->limit is set to zero).
 */
-int
-lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 void **buffers, BlockNumber nblocks, bits8 *mask)
+bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 char *buffer)
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	bool		result = true;
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
-	int			blocks_read = 0;
-	int			buf_offset = 0;

 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
-		return 0;
+		return false;

 	if (!lfc_ensure_opened())
-		return 0;
+		return false;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);

 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	hash = get_hash_value(lfc_hash, &tag);

-	/* 
-	 * For every chunk that has blocks we're interested in, we
-	 * 1. get the chunk header
-	 * 2. Check if the chunk actually has the blocks we're interested in
-	 * 3. Read the blocks we're looking for (in one preadv), assuming they exist
-	 * 4. Update the statistics for the read call.
-	 *
-	 * If there is an error, we do an early return.
-	 */
-	while (nblocks > 0)
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	if (!LFC_ENABLED())
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
-		int		iteration_hits = 0;
-		int		iteration_misses = 0;
-		Assert(blocks_in_chunk > 0);
-
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			iov[i].iov_base = buffers[buf_offset + i];
-			iov[i].iov_len = BLCKSZ;
-		}
-
-		tag.blockNum = blkno - chunk_offs;
-		hash = get_hash_value(lfc_hash, &tag);
-
-		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-		/* We can return the blocks we've read before LFC got disabled;
-		 * assuming we read any. */
-		if (!LFC_ENABLED())
-		{
-			LWLockRelease(lfc_lock);
-			return blocks_read;
-		}
-
-		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
-
-		/* Approximate working set for the blocks assumed in this entry */
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			tag.blockNum = blkno + i;
-			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-		}
-
-		if (entry == NULL)
-		{
-			/* Pages are not cached */
-			lfc_ctl->misses += blocks_in_chunk;
-			pgBufferUsage.file_cache.misses += blocks_in_chunk;
-			LWLockRelease(lfc_lock);
-
-			buf_offset += blocks_in_chunk;
-			nblocks -= blocks_in_chunk;
-			blkno += blocks_in_chunk;
-
-			continue;
-		}
-
-		/* Unlink entry from LRU list to pin it for the duration of IO operation */
-		if (entry->access_count++ == 0)
-			dlist_delete(&entry->list_node);
-
-		generation = lfc_ctl->generation;
-		entry_offset = entry->offset;
-
 		LWLockRelease(lfc_lock);
-
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			/*
-			 * If the page is valid, we consider it "read".
-			 * All other pages will be fetched separately by the next cache
-			 */
-			if (entry->bitmap[(chunk_offs + i) / 32] & (1 << ((chunk_offs + i) % 32)))
-			{
-				BITMAP_SET(mask, buf_offset + i);
-				iteration_hits++;
-			}
-			else
-				iteration_misses++;
-		}
-
-		Assert(iteration_hits + iteration_misses > 0);
-
-		if (iteration_hits != 0)
-		{
-			rc = preadv(lfc_desc, iov, blocks_in_chunk,
-						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-
-			if (rc != (BLCKSZ * blocks_in_chunk))
-			{
-				lfc_disable("read");
-				return -1;
-			}
-		}
-
-		/* Place entry to the head of LRU list */
-		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-		if (lfc_ctl->generation == generation)
-		{
-			CriticalAssert(LFC_ENABLED());
-			lfc_ctl->hits += iteration_hits;
-			lfc_ctl->misses += iteration_misses;
-			pgBufferUsage.file_cache.hits += iteration_hits;
-			pgBufferUsage.file_cache.misses += iteration_misses;
-			CriticalAssert(entry->access_count > 0);
-			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-		}
-		else
-		{
-			/* generation mismatch, assume error condition */
-			LWLockRelease(lfc_lock);
-			return -1;
-		}
-
-		LWLockRelease(lfc_lock);
-
-		buf_offset += blocks_in_chunk;
-		nblocks -= blocks_in_chunk;
-		blkno += blocks_in_chunk;
-		blocks_read += iteration_hits;
+		return false;
 	}

-	return blocks_read;
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+
+	/* Approximate working set */
+	tag.blockNum = blkno;
+	addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+
+	if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
+	{
+		/* Page is not cached */
+		lfc_ctl->misses += 1;
+		pgBufferUsage.file_cache.misses += 1;
+		LWLockRelease(lfc_lock);
+		return false;
+	}
+	/* Unlink entry from LRU list to pin it for the duration of IO operation */
+	if (entry->access_count++ == 0)
+		dlist_delete(&entry->list_node);
+	generation = lfc_ctl->generation;
+	entry_offset = entry->offset;
+
+	LWLockRelease(lfc_lock);
+
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	if (rc != BLCKSZ)
+	{
+		lfc_disable("read");
+		return false;
+	}
+
+	/* Place entry to the head of LRU list */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	if (lfc_ctl->generation == generation)
+	{
+		CriticalAssert(LFC_ENABLED());
+		lfc_ctl->hits += 1;
+		pgBufferUsage.file_cache.hits += 1;
+		CriticalAssert(entry->access_count > 0);
+		if (--entry->access_count == 0)
+			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+	}
+	else
+		result = false;
+
+	LWLockRelease(lfc_lock);
+
+	return result;
 }

 /*
@@ -815,17 +640,20 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
 void
-lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		   const void *const *buffers, BlockNumber nblocks)
+#if PG_MAJORVERSION_NUM < 16
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
+#else
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
+#endif
 {
 	BufferTag	tag;
 	FileCacheEntry *entry;
 	ssize_t		rc;
 	bool		found;
+	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 	uint32		hash;
 	uint64		generation;
 	uint32		entry_offset;
-	int			buf_offset = 0;

 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
@@ -833,142 +661,110 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	if (!lfc_ensure_opened())
 		return;

-	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	CopyNRelFileInfoToBufTag(tag, rinfo);

 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	hash = get_hash_value(lfc_hash, &tag);

-	/* 
-	 * For every chunk that has blocks we're interested in, we
-	 * 1. get the chunk header
-	 * 2. Check if the chunk actually has the blocks we're interested in
-	 * 3. Read the blocks we're looking for (in one preadv), assuming they exist
-	 * 4. Update the statistics for the read call.
-	 *
-	 * If there is an error, we do an early return.
-	 */
-	while (nblocks > 0)
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	if (!LFC_ENABLED())
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
-		Assert(blocks_in_chunk > 0);
+		LWLockRelease(lfc_lock);
+		return;
+	}

-		for (int i = 0; i < blocks_in_chunk; i++)
+	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
+
+	if (found)
+	{
+		/*
+		 * Unlink entry from LRU list to pin it for the duration of IO
+		 * operation
+		 */
+		if (entry->access_count++ == 0)
+			dlist_delete(&entry->list_node);
+	}
+	else
+	{
+		/*
+		 * We have two choices if all cache pages are pinned (i.e. used in IO
+		 * operations):
+		 *
+		 * 1) Wait until some of this operation is completed and pages is
+		 * unpinned.
+		 *
+		 * 2) Allocate one more chunk, so that specified cache size is more
+		 * recommendation than hard limit.
+		 *
+		 * As far as probability of such event (that all pages are pinned) is
+		 * considered to be very very small: there are should be very large
+		 * number of concurrent IO operations and them are limited by
+		 * max_connections, we prefer not to complicate code and use second
+		 * approach.
+		 */
+		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
-			iov[i].iov_base = unconstify(void *, buffers[buf_offset + i]);
-			iov[i].iov_len = BLCKSZ;
+			/* Cache overflow: evict least recently used chunk */
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+
+			CriticalAssert(victim->access_count == 0);
+			entry->offset = victim->offset; /* grab victim's chunk */
+			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
+			neon_log(DEBUG2, "Swap file cache page");
 		}
+		else if (!dlist_is_empty(&lfc_ctl->holes))
+		{
+			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
+			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
+			uint32		offset = hole->offset;
+			bool		found;

-		tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-		hash = get_hash_value(lfc_hash, &tag);
+			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
+			CriticalAssert(found);

+			lfc_ctl->used += 1;
+			entry->offset = offset;	/* reuse the hole */
+		}
+		else
+		{
+			lfc_ctl->used += 1;
+			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
+												 * of file */
+		}
+		entry->access_count = 1;
+		entry->hash = hash;
+		memset(entry->bitmap, 0, sizeof entry->bitmap);
+	}
+
+	generation = lfc_ctl->generation;
+	entry_offset = entry->offset;
+	lfc_ctl->writes += 1;
+	LWLockRelease(lfc_lock);
+
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	if (rc != BLCKSZ)
+	{
+		lfc_disable("write");
+	}
+	else
+	{
 		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

-		if (!LFC_ENABLED())
+		if (lfc_ctl->generation == generation)
 		{
-			LWLockRelease(lfc_lock);
-			return;
+			CriticalAssert(LFC_ENABLED());
+			/* Place entry to the head of LRU list */
+			CriticalAssert(entry->access_count > 0);
+			if (--entry->access_count == 0)
+				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+
+			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
 		}

-		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-		if (found)
-		{
-			/*
-			 * Unlink entry from LRU list to pin it for the duration of IO
-			 * operation
-			 */
-			if (entry->access_count++ == 0)
-				dlist_delete(&entry->list_node);
-		}
-		else
-		{
-			/*
-			 * We have two choices if all cache pages are pinned (i.e. used in IO
-			 * operations):
-			 *
-			 * 1) Wait until some of this operation is completed and pages is
-			 * unpinned.
-			 *
-			 * 2) Allocate one more chunk, so that specified cache size is more
-			 * recommendation than hard limit.
-			 *
-			 * As far as probability of such event (that all pages are pinned) is
-			 * considered to be very very small: there are should be very large
-			 * number of concurrent IO operations and them are limited by
-			 * max_connections, we prefer not to complicate code and use second
-			 * approach.
-			 */
-			if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
-			{
-				/* Cache overflow: evict least recently used chunk */
-				FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-	
-				CriticalAssert(victim->access_count == 0);
-				entry->offset = victim->offset; /* grab victim's chunk */
-				hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-				neon_log(DEBUG2, "Swap file cache page");
-			}
-			else if (!dlist_is_empty(&lfc_ctl->holes))
-			{
-				/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-				FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-				uint32		offset = hole->offset;
-				bool		found;
-	
-				hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
-				CriticalAssert(found);
-	
-				lfc_ctl->used += 1;
-				entry->offset = offset;	/* reuse the hole */
-			}
-			else
-			{
-				lfc_ctl->used += 1;
-				entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-													 * of file */
-			}
-			entry->access_count = 1;
-			entry->hash = hash;
-			memset(entry->bitmap, 0, sizeof entry->bitmap);
-		}
-
-		generation = lfc_ctl->generation;
-		entry_offset = entry->offset;
-		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);
-
-		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
-					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-		if (rc != BLCKSZ * blocks_in_chunk)
-		{
-			lfc_disable("write");
-		}
-		else
-		{
-			LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-			if (lfc_ctl->generation == generation)
-			{
-				CriticalAssert(LFC_ENABLED());
-				/* Place entry to the head of LRU list */
-				CriticalAssert(entry->access_count > 0);
-				if (--entry->access_count == 0)
-					dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
-
-				for (int i = 0; i < blocks_in_chunk; i++)
-				{
-					entry->bitmap[(chunk_offs + i) >> 5] |=
-						(1 << ((chunk_offs + i) & 31));
-				}
-			}
-
-			LWLockRelease(lfc_lock);
-		}
-		blkno += blocks_in_chunk;
-		buf_offset += blocks_in_chunk;
-		nblocks -= blocks_in_chunk;
 	}
 }

--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -537,11 +537,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		/* No more polling needed; connection succeeded */
 		shard->last_connect_time = GetCurrentTimestamp();

-#if PG_MAJORVERSION_NUM >= 17
-		shard->wes_read = CreateWaitEventSet(NULL, 3);
-#else
 		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
-#endif
 		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
 						  MyLatch, NULL);
 		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -6,11 +6,7 @@
 #ifndef NEON_PGVERSIONCOMPAT_H
 #define NEON_PGVERSIONCOMPAT_H

-#if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
-#else
-#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != INVALID_PROC_NUMBER)
-#endif

 #define RelFileInfoEquals(a, b) ( \
 	NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \
@@ -54,7 +50,7 @@
 #define CopyNRelFileInfoToBufTag(tag, rinfo) \
 	do { \
 		(tag).rnode = (rinfo); \
-	} while (false)
+	} while (false);

 #define BufTagGetNRelFileInfo(tag) tag.rnode

@@ -102,7 +98,7 @@
 		(tag).spcOid = (rinfo).spcOid; \
 		(tag).dbOid = (rinfo).dbOid; \
 		(tag).relNumber = (rinfo).relNumber; \
-	} while (false)
+	} while (false);

 #define BufTagGetNRelFileInfo(tag) \
 	((RelFileLocator) { \
@@ -117,10 +113,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

-#if PG_MAJORVERSION_NUM < 17
-#define ProcNumber BackendId
-#define INVALID_PROC_NUMBER InvalidBackendId
-#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
-#endif
-
 #endif							/* NEON_PGVERSIONCOMPAT_H */
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -6,6 +6,8 @@
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
+ * contrib/neon/pagestore_client.h
+ *
 *-------------------------------------------------------------------------
 */
 #ifndef pageserver_h
@@ -185,7 +187,7 @@ extern char *nm_to_string(NeonMessage *msg);
 * API
 */

-typedef uint16 shardno_t;
+typedef unsigned shardno_t;

 typedef struct
 {
@@ -209,7 +211,7 @@ extern int  neon_protocol_version;

 extern shardno_t get_shard_number(BufferTag* tag);

-extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);

@@ -231,13 +233,8 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 							BlockNumber blocknum, int nbuffers, bool skipFsync);
 #endif

-#if PG_MAJORVERSION_NUM >=17
-extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
-						  BlockNumber blocknum, int nblocks);
-#else
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
-#endif

 /*
 * LSN values associated with each request to the pageserver
@@ -272,11 +269,19 @@ typedef struct
 } neon_request_lsns;

 #if PG_MAJORVERSION_NUM < 16
+extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, char *buffer);
+extern void neon_write(SMgrRelation reln, ForkNumber forknum,
+					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
+extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 										 neon_request_lsns request_lsns, void *buffer);
+extern void neon_write(SMgrRelation reln, ForkNumber forknum,
+					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
 extern void neon_writeback(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, BlockNumber nblocks);
@@ -294,34 +299,17 @@ extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockN
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

 /* functions for local file cache */
-extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
-					   BlockNumber blkno, const void *const *buffers,
-					   BlockNumber nblocks);
-/* returns number of blocks read, with one bit set in *read for each  */
-extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
-							BlockNumber blkno, void **buffers,
-							BlockNumber nblocks, bits8 *mask);
-
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno);
-extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno, int nblocks, bits8 *bitmap);
+#if PG_MAJORVERSION_NUM < 16
+extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					  char *buffer);
+#else
+extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					  const void *buffer);
+#endif
+extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);

-static inline bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 void *buffer)
-{
-	bits8		rv = 0;
-	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
-}
-
-static inline void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		  const void *buffer)
-{
-	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
-}

 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -81,7 +81,6 @@ static void nwp_register_gucs(void);
 static void assign_neon_safekeepers(const char *newval, void *extra);
 static void nwp_prepare_shmem(void);
 static uint64 backpressure_lag_impl(void);
-static uint64 startup_backpressure_wrap(void);
 static bool backpressure_throttling_impl(void);
 static void walprop_register_bgworker(void);

@@ -91,7 +90,7 @@ static void walprop_pg_init_bgworker(void);
 static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
 static void walprop_pg_load_libpqwalreceiver(void);

-static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL;
+static process_interrupts_callback_t PrevProcessInterruptsCallback;
 static shmem_startup_hook_type prev_shmem_startup_hook_type;
 #if PG_VERSION_NUM >= 150000
 static shmem_request_hook_type prev_shmem_request_hook = NULL;
@@ -179,7 +178,7 @@ pg_init_walproposer(void)

 	nwp_prepare_shmem();

-	delay_backend_us = &startup_backpressure_wrap;
+	delay_backend_us = &backpressure_lag_impl;
 	PrevProcessInterruptsCallback = ProcessInterruptsCallback;
 	ProcessInterruptsCallback = backpressure_throttling_impl;

@@ -353,22 +352,6 @@ backpressure_lag_impl(void)
 	return 0;
 }

-/*
- * We don't apply backpressure when we're the postmaster, or the startup
- * process, because in postmaster we can't apply backpressure, and in
- * the startup process we can't afford to slow down.
- */
-static uint64
-startup_backpressure_wrap(void)
-{
-	if (AmStartupProcess() || !IsUnderPostmaster)
-		return 0;
-
-	delay_backend_us = &backpressure_lag_impl;
-
-	return backpressure_lag_impl();
-}
-
 /*
 * WalproposerShmemSize --- report amount of shared memory space needed
 */
@@ -418,13 +401,12 @@ WalproposerShmemInit_SyncSafekeeper(void)
 static bool
 backpressure_throttling_impl(void)
 {
-	uint64		lag;
+	int64		lag;
 	TimestampTz start,
 				stop;
-	bool		retry = false;
-
-	if (PointerIsValid(PrevProcessInterruptsCallback))
-		retry = PrevProcessInterruptsCallback();
+	bool		retry = PrevProcessInterruptsCallback
+		? PrevProcessInterruptsCallback()
+		: false;

 	/*
 	 * Don't throttle read only transactions or wal sender. Do throttle CREATE
@@ -620,12 +602,7 @@ walprop_pg_init_walsender(void)
 	/* Create replication slot for WAL proposer if not exists */
 	if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL)
 	{
-#if PG_MAJORVERSION_NUM >= 17
-		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT,
-							  false, false, false);
-#else
 		ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false);
-#endif
 		ReplicationSlotReserveWal();
 		/* Write this slot to disk */
 		ReplicationSlotMarkDirty();
@@ -1532,11 +1509,7 @@ walprop_pg_init_event_set(WalProposer *wp)
 		wpg_log(FATAL, "double-initialization of event set");

 	/* for each sk, we have socket plus potentially socket for neon walreader */
-#if PG_MAJORVERSION_NUM >= 17
-	waitEvents = CreateWaitEventSet(NULL, 2 + 2 * wp->n_safekeepers);
-#else
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
-#endif
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
--- a/pgxn/neon_rmgr/neon_rmgr_decode.c
+++ b/pgxn/neon_rmgr/neon_rmgr_decode.c
@@ -1,7 +1,6 @@
 #include "postgres.h"

 #if PG_MAJORVERSION_NUM >= 16
-
 #include "access/heapam_xlog.h"
 #include "access/neon_xlog.h"
 #include "replication/decode.h"
@@ -10,10 +9,6 @@

 #include "neon_rmgr.h"

-#endif /* PG >= 16 */
-
-#if PG_MAJORVERSION_NUM == 16
-
 /* individual record(group)'s handlers */
 static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
 static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
@@ -404,398 +399,6 @@ DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
 	header->t_infomask2 = xlhdr.t_infomask2;
 	header->t_hoff = xlhdr.t_hoff;
 }
-#endif

-#if PG_MAJORVERSION_NUM == 17

-/* individual record(group)'s handlers */
-static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
-static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
-static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
-static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
-
-/* common function to decode tuples */
-static void DecodeXLogTuple(char *data, Size len, HeapTuple tuple);
-
-
-void
-neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
-{
-	uint8		info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK;
-	TransactionId xid = XLogRecGetXid(buf->record);
-	SnapBuild  *builder = ctx->snapshot_builder;
-
-	ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
-
-	/*
-	 * If we don't have snapshot or we are just fast-forwarding, there is no
-	 * point in decoding data changes.
-	 */
-	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
-		ctx->fast_forward)
-		return;
-
-	switch (info)
-	{
-		case XLOG_NEON_HEAP_INSERT:
-			if (SnapBuildProcessChange(builder, xid, buf->origptr))
-				DecodeNeonInsert(ctx, buf);
-			break;
-		case XLOG_NEON_HEAP_DELETE:
-			if (SnapBuildProcessChange(builder, xid, buf->origptr))
-				DecodeNeonDelete(ctx, buf);
-			break;
-		case XLOG_NEON_HEAP_UPDATE:
-		case XLOG_NEON_HEAP_HOT_UPDATE:
-			if (SnapBuildProcessChange(builder, xid, buf->origptr))
-				DecodeNeonUpdate(ctx, buf);
-			break;
-		case XLOG_NEON_HEAP_LOCK:
-			break;
-		case XLOG_NEON_HEAP_MULTI_INSERT:
-			if (SnapBuildProcessChange(builder, xid, buf->origptr))
-				DecodeNeonMultiInsert(ctx, buf);
-			break;
-		default:
-			elog(ERROR, "unexpected RM_HEAP_ID record type: %u", info);
-			break;
-	}
-}
-
-static inline bool
-FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
-{
-	if (ctx->callbacks.filter_by_origin_cb == NULL)
-		return false;
-
-	return filter_by_origin_cb_wrapper(ctx, origin_id);
-}
-
-/*
- * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
- *
- * Deletes can contain the new tuple.
- */
-static void
-DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
-{
-	Size		datalen;
-	char	   *tupledata;
-	Size		tuplelen;
-	XLogReaderState *r = buf->record;
-	xl_neon_heap_insert *xlrec;
-	ReorderBufferChange *change;
-	RelFileLocator target_locator;
-
-	xlrec = (xl_neon_heap_insert *) XLogRecGetData(r);
-
-	/*
-	 * Ignore insert records without new tuples (this does happen when
-	 * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
-	 */
-	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
-		return;
-
-	/* only interested in our database */
-	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
-	if (target_locator.dbOid != ctx->slot->data.database)
-		return;
-
-	/* output plugin doesn't look for this origin, no need to queue */
-	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
-		return;
-
-	change = ReorderBufferGetChange(ctx->reorder);
-	if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
-		change->action = REORDER_BUFFER_CHANGE_INSERT;
-	else
-		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
-	change->origin_id = XLogRecGetOrigin(r);
-
-	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
-
-	tupledata = XLogRecGetBlockData(r, 0, &datalen);
-	tuplelen = datalen - SizeOfHeapHeader;
-
-	change->data.tp.newtuple =
-		ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
-
-	DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
-
-	change->data.tp.clear_toast_afterwards = true;
-
-	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
-							 change,
-							 xlrec->flags & XLH_INSERT_ON_TOAST_RELATION);
-}
-
-/*
- * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
- *
- * Deletes can possibly contain the old primary key.
- */
-static void
-DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
-{
-	XLogReaderState *r = buf->record;
-	xl_neon_heap_delete *xlrec;
-	ReorderBufferChange *change;
-	RelFileLocator target_locator;
-
-	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);
-
-	/* only interested in our database */
-	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
-	if (target_locator.dbOid != ctx->slot->data.database)
-		return;
-
-	/* output plugin doesn't look for this origin, no need to queue */
-	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
-		return;
-
-	change = ReorderBufferGetChange(ctx->reorder);
-
-	if (xlrec->flags & XLH_DELETE_IS_SUPER)
-		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
-	else
-		change->action = REORDER_BUFFER_CHANGE_DELETE;
-
-	change->origin_id = XLogRecGetOrigin(r);
-
-	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
-
-	/* old primary key stored */
-	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
-	{
-		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapHeader;
-		Size		tuplelen = datalen - SizeOfNeonHeapHeader;
-
-		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));
-
-		change->data.tp.oldtuple =
-			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
-
-		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
-						datalen, change->data.tp.oldtuple);
-	}
-
-	change->data.tp.clear_toast_afterwards = true;
-
-	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
-							 change, false);
-}
-
-/*
- * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout
- * in the record, from wal into proper tuplebufs.
- *
- * Updates can possibly contain a new tuple and the old primary key.
- */
-static void
-DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
-{
-	XLogReaderState *r = buf->record;
-	xl_neon_heap_update *xlrec;
-	ReorderBufferChange *change;
-	char	   *data;
-	RelFileLocator target_locator;
-
-	xlrec = (xl_neon_heap_update *) XLogRecGetData(r);
-
-	/* only interested in our database */
-	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
-	if (target_locator.dbOid != ctx->slot->data.database)
-		return;
-
-	/* output plugin doesn't look for this origin, no need to queue */
-	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
-		return;
-
-	change = ReorderBufferGetChange(ctx->reorder);
-	change->action = REORDER_BUFFER_CHANGE_UPDATE;
-	change->origin_id = XLogRecGetOrigin(r);
-	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
-
-	if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
-	{
-		Size		datalen;
-		Size		tuplelen;
-
-		data = XLogRecGetBlockData(r, 0, &datalen);
-
-		tuplelen = datalen - SizeOfNeonHeapHeader;
-
-		change->data.tp.newtuple =
-			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
-
-		DecodeXLogTuple(data, datalen, change->data.tp.newtuple);
-	}
-
-	if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD)
-	{
-		Size		datalen;
-		Size		tuplelen;
-
-		/* caution, remaining data in record is not aligned */
-		data = XLogRecGetData(r) + SizeOfNeonHeapUpdate;
-		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate;
-		tuplelen = datalen - SizeOfNeonHeapHeader;
-
-		change->data.tp.oldtuple =
-			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
-
-		DecodeXLogTuple(data, datalen, change->data.tp.oldtuple);
-	}
-
-	change->data.tp.clear_toast_afterwards = true;
-
-	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
-							 change, false);
-}
-
-/*
- * Decode XLOG_HEAP2_MULTI_INSERT_insert record into multiple tuplebufs.
- *
- * Currently MULTI_INSERT will always contain the full tuples.
- */
-static void
-DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
-{
-	XLogReaderState *r = buf->record;
-	xl_neon_heap_multi_insert *xlrec;
-	int			i;
-	char	   *data;
-	char	   *tupledata;
-	Size		tuplelen;
-	RelFileLocator rlocator;
-
-	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);
-
-	/*
-	 * Ignore insert records without new tuples.  This happens when a
-	 * multi_insert is done on a catalog or on a non-persistent relation.
-	 */
-	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
-		return;
-
-	/* only interested in our database */
-	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
-	if (rlocator.dbOid != ctx->slot->data.database)
-		return;
-
-	/* output plugin doesn't look for this origin, no need to queue */
-	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
-		return;
-
-	/*
-	 * We know that this multi_insert isn't for a catalog, so the block should
-	 * always have data even if a full-page write of it is taken.
-	 */
-	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
-	Assert(tupledata != NULL);
-
-	data = tupledata;
-	for (i = 0; i < xlrec->ntuples; i++)
-	{
-		ReorderBufferChange *change;
-		xl_neon_multi_insert_tuple *xlhdr;
-		int			datalen;
-		HeapTuple	tuple;
-		HeapTupleHeader header;
-
-		change = ReorderBufferGetChange(ctx->reorder);
-		change->action = REORDER_BUFFER_CHANGE_INSERT;
-		change->origin_id = XLogRecGetOrigin(r);
-
-		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));
-
-		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
-		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
-		datalen = xlhdr->datalen;
-
-		change->data.tp.newtuple =
-			ReorderBufferGetTupleBuf(ctx->reorder, datalen);
-
-		tuple = change->data.tp.newtuple;
-		header = tuple->t_data;
-
-		/* not a disk based tuple */
-		ItemPointerSetInvalid(&tuple->t_self);
-
-		/*
-		 * We can only figure this out after reassembling the transactions.
-		 */
-		tuple->t_tableOid = InvalidOid;
-
-		tuple->t_len = datalen + SizeofHeapTupleHeader;
-
-		memset(header, 0, SizeofHeapTupleHeader);
-
-		memcpy((char *) tuple->t_data + SizeofHeapTupleHeader,
-			   (char *) data,
-			   datalen);
-		header->t_infomask = xlhdr->t_infomask;
-		header->t_infomask2 = xlhdr->t_infomask2;
-		header->t_hoff = xlhdr->t_hoff;
-
-		/*
-		 * Reset toast reassembly state only after the last row in the last
-		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
-		 * call.
-		 */
-		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
-			(i + 1) == xlrec->ntuples)
-			change->data.tp.clear_toast_afterwards = true;
-		else
-			change->data.tp.clear_toast_afterwards = false;
-
-		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
-								 buf->origptr, change, false);
-
-		/* move to the next xl_neon_multi_insert_tuple entry */
-		data += datalen;
-	}
-	Assert(data == tupledata + tuplelen);
-}
-
-/*
- * Read a HeapTuple as WAL logged by heap_insert, heap_update and heap_delete
- * (but not by heap_multi_insert) into a tuplebuf.
- *
- * The size 'len' and the pointer 'data' in the record need to be
- * computed outside as they are record specific.
- */
-static void
-DecodeXLogTuple(char *data, Size len, HeapTuple tuple)
-{
-	xl_neon_heap_header xlhdr;
-	int			datalen = len - SizeOfNeonHeapHeader;
-	HeapTupleHeader header;
-
-	Assert(datalen >= 0);
-
-	tuple->t_len = datalen + SizeofHeapTupleHeader;
-	header = tuple->t_data;
-
-	/* not a disk based tuple */
-	ItemPointerSetInvalid(&tuple->t_self);
-
-	/* we can only figure this out after reassembling the transactions */
-	tuple->t_tableOid = InvalidOid;
-
-	/* data is not stored aligned, copy to aligned storage */
-	memcpy((char *) &xlhdr,
-		   data,
-		   SizeOfNeonHeapHeader);
-
-	memset(header, 0, SizeofHeapTupleHeader);
-
-	memcpy(((char *) tuple->t_data) + SizeofHeapTupleHeader,
-		   data + SizeOfNeonHeapHeader,
-		   datalen);
-
-	header->t_infomask = xlhdr.t_infomask;
-	header->t_infomask2 = xlhdr.t_infomask2;
-	header->t_hoff = xlhdr.t_hoff;
-}
-#endif
+#endif
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -68,13 +68,8 @@ static void inmem_close(SMgrRelation reln, ForkNumber forknum);
 static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
 static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo);
-#if PG_MAJORVERSION_NUM >= 17
-static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber blocknum, int nblocks);
-#else
 static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum);
-#endif
 #if PG_MAJORVERSION_NUM < 16
 static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, char *buffer, bool skipFsync);
@@ -98,9 +93,7 @@ static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
 static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber nblocks);
 static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
-#if PG_MAJORVERSION_NUM >= 17
-static void inmem_registersync(SMgrRelation reln, ForkNumber forknum);
-#endif
+

 /*
 *	inmem_init() -- Initialize private state
@@ -197,14 +190,6 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
 {
 }

-#if PG_MAJORVERSION_NUM >= 17
-static bool
-inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-			   int nblocks)
-{
-	return true;
-}
-#else
 /*
 *	inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
 */
@@ -213,7 +198,6 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 	return true;
 }
-#endif

 /*
 * inmem_writeback() -- Tell the kernel to write pages back to storage.
@@ -227,13 +211,11 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 /*
 *	inmem_read() -- Read the specified block from a relation.
 */
-#if PG_MAJORVERSION_NUM < 16
 static void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
 		   char *buffer)
 #else
-static void
-inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		   void *buffer)
 #endif
 {
@@ -246,18 +228,6 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		memcpy(buffer, page_body[pg], BLCKSZ);
 }

-#if PG_MAJORVERSION_NUM >= 17
-static void
-inmem_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
-			void **buffers, BlockNumber nblocks)
-{
-	for (int i = 0; i < nblocks; i++)
-	{
-		inmem_read(reln, forknum, blkno, buffers[i]);
-	}
-}
-#endif
-
 /*
 *	inmem_write() -- Write the supplied block at the appropriate location.
 *
@@ -310,18 +280,6 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	memcpy(page_body[pg], buffer, BLCKSZ);
 }

-#if PG_MAJORVERSION_NUM >= 17
-static void
-inmem_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
-			 const void **buffers, BlockNumber nblocks, bool skipFsync)
-{
-	for (int i = 0; i < nblocks; i++)
-	{
-		inmem_write(reln, forknum, blkno, buffers[i], skipFsync);
-	}
-}
-#endif
-
 /*
 *	inmem_nblocks() -- Get the number of blocks stored in a relation.
 */
@@ -357,13 +315,6 @@ inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
 {
 }

-#if PG_MAJORVERSION_NUM >= 17
-static void
-inmem_registersync(SMgrRelation reln, ForkNumber forknum)
-{
-}
-#endif
-
 static const struct f_smgr inmem_smgr =
 {
 	.smgr_init = inmem_init,
@@ -377,39 +328,23 @@ static const struct f_smgr inmem_smgr =
 #if PG_MAJORVERSION_NUM >= 16
 	.smgr_zeroextend = inmem_zeroextend,
 #endif
-#if PG_MAJORVERSION_NUM >= 17
-	.smgr_prefetch = inmem_prefetch,
-	.smgr_readv = inmem_readv,
-	.smgr_writev = inmem_writev,
-#else
 	.smgr_prefetch = inmem_prefetch,
 	.smgr_read = inmem_read,
 	.smgr_write = inmem_write,
-#endif
 	.smgr_writeback = inmem_writeback,
 	.smgr_nblocks = inmem_nblocks,
 	.smgr_truncate = inmem_truncate,
 	.smgr_immedsync = inmem_immedsync,
-
-#if PG_MAJORVERSION_NUM >= 17
-	.smgr_registersync = inmem_registersync,
-#endif
-
-	.smgr_start_unlogged_build = NULL,
-	.smgr_finish_unlogged_build_phase_1 = NULL,
-	.smgr_end_unlogged_build = NULL,
-	.smgr_read_slru_segment = NULL,
 };

 const f_smgr *
-smgr_inmem(ProcNumber backend, NRelFileInfo rinfo)
+smgr_inmem(BackendId backend, NRelFileInfo rinfo)
 {
 	Assert(InRecovery);
-	// // What does this code do?
-	// if (backend != INVALID_PROC_NUMBER)
-	// 	return smgr_standard(backend, rinfo);
-	// else
-	return &inmem_smgr;
+	if (backend != InvalidBackendId)
+		return smgr_standard(backend, rinfo);
+	else
+		return &inmem_smgr;
 }

 void
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -11,7 +11,7 @@
 #ifndef INMEM_SMGR_H
 #define INMEM_SMGR_H

-extern const f_smgr *smgr_inmem(ProcNumber backend, NRelFileInfo rinfo);
+extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_inmem(void);

 #endif /* INMEM_SMGR_H */
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -100,9 +100,6 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/dsm.h"
-#if PG_MAJORVERSION_NUM >= 17
-#include "storage/dsm_registry.h"
-#endif
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
@@ -140,7 +137,7 @@ static BufferTag target_redo_tag;

 static XLogReaderState *reader_state;

-#define TRACE LOG
+#define TRACE DEBUG5

 #ifdef HAVE_LIBSECCOMP

@@ -520,10 +517,6 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up xlog, clog, and buffers
 	 */
-#if PG_MAJORVERSION_NUM >= 17
-	DSMRegistryShmemInit();
-	VarsupShmemInit();
-#endif
 	XLOGShmemInit();
 	CLOGShmemInit();
 	CommitTsShmemInit();
@@ -573,10 +566,7 @@ CreateFakeSharedMemoryAndSemaphores()
 	/*
 	 * Set up other modules that need some shared memory space
 	 */
-#if PG_MAJORVERSION_NUM < 17
-	/* "snapshot too old" was removed in PG17, and with it the SnapMgr */
 	SnapMgrInit();
-#endif
 	BTreeShmemInit();
 	SyncScanShmemInit();
 	/* Skip due to the 'pg_notify' directory check */
@@ -752,7 +742,7 @@ BeginRedoForBlock(StringInfo input_message)
 		 target_redo_tag.forkNum,
 		 target_redo_tag.blockNum);

-	reln = smgropen(rinfo, INVALID_PROC_NUMBER, RELPERSISTENCE_PERMANENT);
+	reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT);
 	if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
 		reln->smgr_cached_nblocks[forknum] < blknum + 1)
 	{
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -18,6 +18,7 @@ atomic-take.workspace = true
 aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
+aws-types.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
@@ -25,6 +26,7 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -46,9 +48,11 @@ indexmap.workspace = true
 ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
+md5.workspace = true
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
+opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
@@ -63,6 +67,7 @@ reqwest.workspace = true
 reqwest-middleware = { workspace = true, features = ["json"] }
 reqwest-retry.workspace = true
 reqwest-tracing.workspace = true
+routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -74,6 +79,7 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
+task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
@@ -82,6 +88,7 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
+tower-service.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Vlad Lazar	3640824553	fixup: incorrect assumption on img layer need	2024-09-12 23:04:58 +01:00
Vlad Lazar	fea1f34f6a	fixup: serialize again in image layer	2024-09-12 23:04:10 +01:00
Vlad Lazar	5d40d1ccdd	fixup: order of waiters in delta layer	2024-09-12 23:04:01 +01:00
Vlad Lazar	b2cb10590e	fixup: deserialize shenanigans	2024-09-12 20:00:06 +01:00
Vlad Lazar	2923fd2a5b	fixup: remove stale import	2024-09-12 19:25:46 +01:00
Vlad Lazar	2a5336b9ab	fixup image deserialization	2024-09-12 19:24:41 +01:00
Christian Schwarz	6f20726610	Merge remote-tracking branch 'origin/hackaneon/lisbon24/superscalar-page_service--problame/evaluate-debouncer' into hackaneon/lisbon24/superscalar-page_service	2024-09-12 17:47:16 +00:00
Christian Schwarz	29f741e1e9	debounce: actually issue vectored get	2024-09-12 17:46:30 +00:00
Vlad Lazar	2b37a40079	Materialize future ios	2024-09-12 18:25:17 +01:00
Vlad Lazar	af2b65a2fb	Rework issuing of IOs on read path	2024-09-12 16:42:35 +01:00
Christian Schwarz	5d194c7824	debounce: bounce if shard or effective request_lsn differ	2024-09-12 14:20:32 +00:00
Christian Schwarz	ac2702afd3	deboucner: move decoding into debounce loop	2024-09-12 10:58:09 +00:00
Christian Schwarz	88fd46d795	sketch interface	2024-09-12 11:35:00 +01:00
Christian Schwarz	2d6763882e	pagebench: fake queue depth of 10	2024-09-12 11:35:00 +01:00
Christian Schwarz	c0c23cde72	debouncer	2024-09-12 11:35:00 +01:00
Christian Schwarz	942bc9544b	fixup	2024-09-11 20:04:39 +00:00
Christian Schwarz	02b7cdb305	HACK: instrument page_service to count nonblocking consecutive getpage requests	2024-09-11 19:25:19 +01:00
				`@@ -1 +0,0 @@`
				`GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;`