Compare commits

..

340 Commits

Author SHA1 Message Date
Folke Behrens
8a6ee79f6f Merge pull request #9921 from neondatabase/rc/release-proxy/2024-11-28
Proxy release 2024-11-28
2024-11-28 11:09:06 +01:00
github-actions[bot]
9052c32b46 Proxy release 2024-11-28 2024-11-28 06:02:15 +00:00
Erik Grinaker
da1daa2426 pageserver: only apply ClearVmBits on relevant shards (#9895)
# Problem

VM (visibility map) pages are stored and managed like any regular relation
page, in the VM fork of the main relation. They are also sharded like
other pages. Regular WAL writes to the VM pages (typically performed by
vacuum) are routed to the correct shard as usual. However, VM pages are
also updated via `ClearVmBits` metadata records emitted when main
relation pages are updated. These metadata records were sent to all
shards, like other metadata records. This had the following effects:

* On shards responsible for VM pages, the `ClearVmBits` applies as
expected.

* On shard 0, which knows about the VM relation and its size but doesn't
necessarily have any VM pages, the `ClearVmBits` writes may have been
applied without also having applied the explicit WAL writes to VM pages.

* If VM pages are spread across multiple shards (unlikely with 256MB
stripe size), all shards may have applied `ClearVmBits` if the pages
fall within their local view of the relation size, even for pages they
do not own.

* On other shards, this caused a relation size cache miss and a DbDir
and RelDir lookup before dropping the `ClearVmBits`. With many
relations, this could cause significant CPU overhead.

This is not believed to be a correctness problem, but this will be
verified in #9914.

Resolves #9855.

# Changes

Route `ClearVmBits` metadata records only to the shards responsible for
the VM pages.
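
As an illustration, a minimal sketch of the shard-ownership check; the types and the striped placement below are simplified stand-ins, not the pageserver's actual API (real placement also hashes the relation identity):

```rust
// Simplified stand-ins, not the pageserver's actual types.
struct ShardIdentity {
    number: u32,
    count: u32,
    stripe_size_pages: u32, // a 256MB stripe = 32768 8KB pages
}

impl ShardIdentity {
    /// True if this shard owns the given block of the VM fork.
    fn owns_vm_block(&self, vm_blkno: u32) -> bool {
        self.count <= 1 || (vm_blkno / self.stripe_size_pages) % self.count == self.number
    }
}

struct ClearVmBits {
    vm_blkno: u32,
}

/// Apply the metadata record only on the shard that owns the VM page; other
/// shards drop it without the DbDir/RelDir lookup.
fn maybe_apply(shard: &ShardIdentity, rec: &ClearVmBits) -> bool {
    shard.owns_vm_block(rec.vm_blkno)
}

fn main() {
    let shard = ShardIdentity { number: 1, count: 8, stripe_size_pages: 32768 };
    println!("apply on this shard: {}", maybe_apply(&shard, &ClearVmBits { vm_blkno: 40_000 }));
}
```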

Verification of the current VM handling and cleanup of incomplete VM
pages on shard 0 (and potentially elsewhere) is left as follow-up work.
2024-11-27 19:44:24 +00:00
Alex Chi Z.
9e3cb75bc7 fix(pageserver): flush deletion queue in reload shutdown mode (#9884)
## Problem

close https://github.com/neondatabase/neon/issues/9859

## Summary of changes

Ensure that the deletion queue gets fully flushed (i.e., the deletion
lists get applied) during a graceful shutdown.

It is still possible that an incomplete shutdown would leave deletion
lists behind and cause a race on the next startup, but we assume this
is unlikely to happen, and even if it did, the pageserver would already
be in a tainted state and the tenant should have been re-attached with a
new generation number.
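
A minimal sketch of the flush-on-shutdown behaviour, using toy types rather than the pageserver's actual deletion queue API:

```rust
use std::collections::VecDeque;

// Toy stand-in: pending "deletion lists" of remote object keys that should be
// executed before the process exits, so they can't race with the tenant's
// next generation.
struct DeletionQueue {
    pending_lists: VecDeque<Vec<String>>,
}

impl DeletionQueue {
    async fn flush(&mut self) {
        while let Some(list) = self.pending_lists.pop_front() {
            for key in list {
                // remote_storage.delete(&key).await in real code
                println!("deleting {key}");
            }
        }
    }
}

async fn graceful_shutdown(queue: &mut DeletionQueue) {
    // Flush before exit so no deletion lists are left behind.
    queue.flush().await;
}

#[tokio::main]
async fn main() {
    let mut q = DeletionQueue {
        pending_lists: VecDeque::from([vec!["tenant/timeline/layer-0001".to_string()]]),
    };
    graceful_shutdown(&mut q).await;
}
```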

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-11-27 18:30:54 +00:00
Folke Behrens
5c41707bee proxy: promote two logs to error, fix multiline log (#9913)
* Promote two logs from mpsc send errors to error level. The channels
are unbounded and there shouldn't be errors.
* Fix one multiline log from anyhow::Error. Use Debug instead of
Display.
2024-11-27 18:05:46 +00:00
Erik Grinaker
cc37fa0f33 pageserver: add metrics for unknown ClearVmBits pages (#9911)
## Problem

When ingesting implicit `ClearVmBits` operations, we silently drop the
writes if the relation or page is unknown. There are implicit
assumptions around VM pages wrt. explicit/implicit updates, sharding,
and relation sizes, which can possibly drop writes incorrectly. Adding a
few metrics will allow us to investigate further and tighten up the
logic.

Touches #9855.

## Summary of changes

Add a `pageserver_wal_ingest_clear_vm_bits_unknown` metric to record
dropped `ClearVmBits` writes.
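
As an illustration, a hedged sketch of how such a counter could be registered with the `prometheus` crate; the real metric in the pageserver may be shaped differently (for example, different labels):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// Sketch only: a counter labelled by why the ClearVmBits write was dropped.
static CLEAR_VM_BITS_UNKNOWN: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_wal_ingest_clear_vm_bits_unknown",
        "ClearVmBits operations dropped due to unknown VM relation or page",
        &["reason"]
    )
    .expect("failed to register metric")
});

fn drop_clear_vm_bits(reason: &str) {
    // e.g. reason = "relation_not_found" or "page_out_of_bounds"
    CLEAR_VM_BITS_UNKNOWN.with_label_values(&[reason]).inc();
}

fn main() {
    drop_clear_vm_bits("relation_not_found");
}
```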

Also add comments clarifying the behavior of relation sizes on non-zero
shards.
2024-11-27 17:16:41 +00:00
Alex Chi Z.
23f5a27146 fix(storage-scrubber): valid layermap error degrades to warning (#9902)
The valid-layer assumption is a necessary condition for a layer map to be
valid; it is a stronger check, imposed by gc-compaction, than the actual
valid-layermap definition. In practice, the system can work as long as
there are no overlapping layer maps, so we degrade this error to a
warning.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-11-27 16:07:39 +00:00
Erik Grinaker
e4f437a354 pageserver: add relsize cache metrics (#9890)
## Problem

We don't have any observability for the relation size cache. We have
seen cache misses cause significant performance impact with high
relation counts.

Touches #9855.

## Summary of changes

Adds the following metrics:

* `pageserver_relsize_cache_entries`
* `pageserver_relsize_cache_hits`
* `pageserver_relsize_cache_misses`
* `pageserver_relsize_cache_misses_old`
2024-11-27 13:54:14 +00:00
Vlad Lazar
8fdf786217 pageserver: add tenant config override for wal receiver proto (#9888)
## Problem

Can't change protocol at tenant granularity.

## Summary of changes

Add a tenant-level config override for the WAL receiver protocol.
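
A sketch of the override semantics (names are made up, not the actual config structs): the per-tenant option, when unset, falls back to the pageserver-wide default.

```rust
// Illustrative only; variants and struct names are assumptions.
#[derive(Clone, Copy, Debug, PartialEq)]
enum WalReceiverProtocol {
    Vanilla,     // raw WAL over the plain replication protocol
    Interpreted, // pre-decoded records from the safekeeper
}

struct PageserverConf {
    wal_receiver_protocol: WalReceiverProtocol,
}

#[derive(Default)]
struct TenantConfOpt {
    wal_receiver_protocol_override: Option<WalReceiverProtocol>,
}

fn effective_protocol(global: &PageserverConf, tenant: &TenantConfOpt) -> WalReceiverProtocol {
    tenant
        .wal_receiver_protocol_override
        .unwrap_or(global.wal_receiver_protocol)
}

fn main() {
    let global = PageserverConf { wal_receiver_protocol: WalReceiverProtocol::Vanilla };
    let tenant = TenantConfOpt {
        wal_receiver_protocol_override: Some(WalReceiverProtocol::Interpreted),
    };
    assert_eq!(effective_protocol(&global, &tenant), WalReceiverProtocol::Interpreted);
}
```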

## Links

Related: https://github.com/neondatabase/neon/issues/9336
Epic: https://github.com/neondatabase/neon/issues/9329
2024-11-27 13:46:23 +00:00
Vlad Lazar
9e0148de11 safekeeper: use protobuf for sending compressed records to pageserver (#9821)
## Problem

https://github.com/neondatabase/neon/pull/9746 lifted decoding and
interpretation of WAL to the safekeeper.
This reduced the ingested amount on the pageservers by around 10x for a
tenant with 8 shards, but doubled
the ingested amount for single sharded tenants.

Also, https://github.com/neondatabase/neon/pull/9746 uses bincode which
doesn't support schema evolution.
Technically the schema can be evolved, but it's very cumbersome.

## Summary of changes

This patch set addresses both problems by adding protobuf support for
the interpreted wal records and adding compression support. Compressed
protobuf reduced the ingested amount by 100x on the 32 shards
`test_sharded_ingest` case (compared to non-interpreted proto). For the
1 shard case the reduction is 5x.
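
To make the encode-then-compress idea concrete, here is a hedged sketch using `prost` plus zstd; the actual protobuf schema and codec used in this PR may differ:

```rust
use prost::Message;

// Made-up stand-in for the interpreted-record batch; the real schema is richer
// (shard metadata, per-record LSNs, etc.).
#[derive(Clone, PartialEq, prost::Message)]
struct InterpretedWalRecords {
    #[prost(uint64, tag = "1")]
    next_record_lsn: u64,
    #[prost(bytes = "vec", repeated, tag = "2")]
    records: Vec<Vec<u8>>,
}

fn encode_compressed(batch: &InterpretedWalRecords) -> std::io::Result<Vec<u8>> {
    // Protobuf gives schema evolution; compression recovers the size lost by
    // sending the same interpreted records to every shard (zstd assumed here).
    let encoded = batch.encode_to_vec();
    zstd::encode_all(encoded.as_slice(), /* level */ 3)
}

fn main() -> std::io::Result<()> {
    let batch = InterpretedWalRecords {
        next_record_lsn: 0x1_0000_0000,
        records: vec![b"interpreted record bytes".to_vec()],
    };
    let wire = encode_compressed(&batch)?;
    println!("wire size: {} bytes", wire.len());
    Ok(())
}
```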

Sister change to `rust-postgres` is
[here](https://github.com/neondatabase/rust-postgres/pull/33).

## Links

Related: https://github.com/neondatabase/neon/issues/9336
Epic: https://github.com/neondatabase/neon/issues/9329
2024-11-27 12:12:21 +00:00
Alexander Bayandin
7b41ee872e CI(pre-merge-checks): build only one build-tools-image (#9718)
## Problem

The `pre-merge-checks` workflow relies on the build-tools image. 
If changes to the `build-tools` image have been merged into the main
branch since the last CI run for a PR that also touches `build-tools`,
the image will be rebuilt during the merge queue run. Otherwise, cached
images are used.
Rebuilding the image adds approximately 10 minutes on x86-64 and 20
minutes on arm64 to the process.

## Summary of changes
- Parametrise `build-build-tools-image` job with arch and Debian version
- Run `pre-merge-checks` only on Debian 12 x86-64 image
2024-11-27 10:42:26 +00:00
Peter Bendel
277c33ba3f ingest benchmark: after effective_io_concurrency = 100 we can increase compute side parallelism (#9904)
## Problem

The ingest benchmark tests project migration to Neon, involving these steps:
- COPY relation data
- create indexes
- create constraints

Previously we used only 4 copy jobs, 4 create-index jobs and 7
maintenance workers. After increasing effective_io_concurrency on
compute, we see that we can sustain more parallelism in the ingest bench.

## Summary of changes

Increase copy jobs to 8, create index jobs to 8 and maintenance workers
to 16
2024-11-27 10:09:01 +00:00
Tristan Partin
2b788cb53f Bump neon.logical_replication_max_snap_files default to 10000 (#9896)
This bump comes from a recommendation from Chi.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-11-26 17:49:37 +00:00
Peter Bendel
13feda0669 track how much time the flush loop is stalled waiting for uploads (#9885)
## Problem

We don't know how much time PS is losing during ingest when waiting for
remote storage uploads in the flush frozen layer loop.
Also, we don't know how many remote storage requests get a permit
without waiting (i.e., are not throttled by the remote_storage concurrency_limit).

## Summary of changes

- Add a metric that accumulates the time waited per shard/PS
- in [remote storage semaphore wait
seconds](https://neonprod.grafana.net/d/febd9732-9bcf-4992-a821-49b1f6b02724/remote-storage?orgId=1&var-datasource=HUNg6jvVk&var-instance=pageserver-26.us-east-2.aws.neon.build&var-instance=pageserver-27.us-east-2.aws.neon.build&var-instance=pageserver-28.us-east-2.aws.neon.build&var-instance=pageserver-29.us-east-2.aws.neon.build&var-instance=pageserver-30.us-east-2.aws.neon.build&var-instance=pageserver-31.us-east-2.aws.neon.build&var-instance=pageserver-36.us-east-2.aws.neon.build&var-instance=pageserver-37.us-east-2.aws.neon.build&var-instance=pageserver-38.us-east-2.aws.neon.build&var-instance=pageserver-39.us-east-2.aws.neon.build&var-instance=pageserver-40.us-east-2.aws.neon.build&var-instance=pageserver-41.us-east-2.aws.neon.build&var-request_type=put_object&from=1731961336340&to=1731964762933&viewPanel=3)
add a first bucket with 100 microseconds to count requests that do not
need to wait on semaphore

Update: created a new version that uses a Gauge (one increasing value
per PS/shard) instead of a histogram, as suggested in review.
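
A sketch of the accounting (names are illustrative): time the semaphore acquisition and accumulate the waited seconds into one ever-increasing value per pageserver/shard.

```rust
use std::time::Instant;
use tokio::sync::{Semaphore, SemaphorePermit};

// Sketch only: measure how long we wait for a remote-storage permit and add it
// to a Gauge used as an accumulating counter.
async fn acquire_with_wait_accounting<'a>(
    sem: &'a Semaphore,
    wait_seconds_total: &prometheus::Gauge,
) -> SemaphorePermit<'a> {
    let start = Instant::now();
    let permit = sem.acquire().await.expect("semaphore closed");
    wait_seconds_total.add(start.elapsed().as_secs_f64());
    permit
}

#[tokio::main]
async fn main() {
    let sem = Semaphore::new(100); // remote_storage concurrency_limit
    let waited = prometheus::Gauge::new(
        "remote_storage_wait_seconds_total",
        "Accumulated time spent waiting for an upload permit",
    )
    .unwrap();
    let _permit = acquire_with_wait_accounting(&sem, &waited).await;
    println!("waited {:.6}s so far", waited.get());
}
```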
2024-11-26 11:46:58 +00:00
Conrad Ludgate
96a1b71c84 chore(proxy): discard request context span during passthrough (#9882)
## Problem

The RequestContext::span shouldn't live for the entire postgres
connection, only the handshake.

## Summary of changes

* Slight refactor to the RequestContext to discard the span upon
handshake completion.
* Make sure the temporary future for the handshake is dropped (not bound
to a variable)
* Run our nightly fmt script
2024-11-25 21:32:53 +00:00
Arpad Müller
a74ab9338d fast_import: remove hardcoding of pg_version (#9878)
Before, we hardcoded the pg_version to 140000, while the code expected
version numbers like 14. Now we use an enum, and code from
`extension_server.rs` to auto-detect the correct version. The enum helps
when we add support for a new version: compilation fails if one forgets
to add the new version to one of the `match` arms.
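
A small example of that compile-time guarantee: adding a new variant to the enum breaks the build at every non-exhaustive `match` (the variants and paths below are made up for illustration).

```rust
#[derive(Clone, Copy, Debug)]
enum PgMajorVersion {
    V14,
    V15,
    V16,
}

fn binary_dir(version: PgMajorVersion) -> &'static str {
    // No catch-all arm on purpose: forgetting a newly added variant here is a
    // compile error, not a silent fallthrough like with a raw integer.
    match version {
        PgMajorVersion::V14 => "pg_install/v14/bin",
        PgMajorVersion::V15 => "pg_install/v15/bin",
        PgMajorVersion::V16 => "pg_install/v16/bin",
    }
}

fn main() {
    println!("{}", binary_dir(PgMajorVersion::V16));
}
```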

cc https://github.com/neondatabase/neon/pull/9218
2024-11-25 20:23:42 +00:00
Folke Behrens
7404887b81 proxy: Demote errors from cplane request routines to debug (#9886)
## Problem

Any errors from these async blocks are unconditionally logged at error
level
even though we already handle such errors based on context.

## Summary of changes

* Log raw errors from creating and executing cplane requests at debug
level.
* Inline macro calls to retain the correct callsite.
2024-11-25 19:35:32 +00:00
Folke Behrens
87e4dd23a1 proxy: Demote all cplane error replies to info log level (#9880)
## Problem

The vast majority of the error/warn logs from cplane are about time or
data transfer quotas exceeded or endpoint-not-found errors and not
operational errors in proxy or cplane.

## Summary of changes

* Demote cplane error replies to info level.
* Raise other errors from warn back to error.
2024-11-25 17:53:26 +00:00
Vlad Lazar
7a2f0ed8d4 safekeeper: lift decoding and interpretation of WAL to the safekeeper (#9746)
## Problem

For any given tenant shard, pageservers receive all of the tenant's WAL
from the safekeeper.
This soft-blocks us from using larger shard counts due to bandwidth
concerns and CPU overhead of filtering
out the records.

## Summary of changes

This PR lifts the decoding and interpretation of WAL from the pageserver
into the safekeeper.

A customised PG replication protocol is used where instead of sending
raw WAL, the safekeeper sends
filtered, interpreted records. The receiver drives the protocol
selection, so, on the pageserver side, usage
of the new protocol is gated by a new pageserver config:
`wal_receiver_protocol`.

More granularly, the changes are:
1. Optionally inject the protocol and shard identity into the arguments
used for starting replication
2. On the safekeeper side, implement a new WAL sending primitive which
decodes and interprets records before sending them over
3. On the pageserver side, implement the ingestion of this new
replication message type. It's very similar to what we already have for
raw WAL (minus decoding and interpreting).

## Notes

* This PR currently uses my [branch of
rust-postgres](https://github.com/neondatabase/rust-postgres/tree/vlad/interpreted-wal-record-replication-support)
which includes the deserialization logic for the new replication message
type. PR for that is open
[here](https://github.com/neondatabase/rust-postgres/pull/32).
* This PR contains changes for both pageservers and safekeepers. It's
safe to merge because the new protocol is disabled by default on the
pageserver side. We can gradually start enabling it in subsequent
releases.
* CI tests are running on https://github.com/neondatabase/neon/pull/9747
 
## Links

Related: https://github.com/neondatabase/neon/issues/9336
Epic: https://github.com/neondatabase/neon/issues/9329
2024-11-25 17:29:28 +00:00
Christian Schwarz
5c2356988e page_service: add benchmark for batching (#9820)
This PR adds two benchmarks to demonstrate the effect of server-side
getpage request batching added in
https://github.com/neondatabase/neon/pull/9321.

For the CPU usage, I found that the `prometheus` crate's built-in CPU
usage accounts the seconds at integer granularity. That's not enough when
you reduce the target benchmark runtime for local iteration. So, add a new
`libmetrics` metric and report that.

The benchmarks are disabled because [on our benchmark nodes, timer
resolution isn't high
enough](https://neondb.slack.com/archives/C059ZC138NR/p1732264223207449).
They work (no statement about quality) on my bare-metal devbox.

They will be refined and enabled once we find a fix. Candidates at time
of writing are:
- https://github.com/neondatabase/neon/pull/9822
- https://github.com/neondatabase/neon/pull/9851


Refs:

- Epic: https://github.com/neondatabase/neon/issues/9376
- Extracted from https://github.com/neondatabase/neon/pull/9792
2024-11-25 15:52:39 +00:00
Konstantin Knizhnik
441612c1ce Prefetch on macos (#9875)
## Problem

Prefetch is disabled on macOS because `posix_fadvise` is not available.
But Neon's prefetch does not use this function, and for testing on macOS
it is very convenient to have prefetch available.

## Summary of changes

Define `USE_PREFETCH` in Makefile.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-11-25 15:21:52 +00:00
Arpad Müller
77630e5408 Address beta clippy lint needless_lifetimes (#9877)
The 1.82.0 version of Rust will be stable soon, let's get the clippy
lint fixes in before the compiler version upgrade.
2024-11-25 14:59:12 +00:00
Alexander Bayandin
3d380acbd1 Bump default Debian version to Bookworm everywhere (#9863)
## Problem

We have a couple of CI workflows that still run on Debian Bullseye, and
the default Debian version in images is Bullseye as well (we have to
explicitly set Bookworm when building).

## Summary of changes
- Run `pgbench-pgvector` on Bookworm (fix a couple of packages)
- Run `trigger_bench_on_ec2_machine_in_eu_central_1` on Bookworm
- Change default `DEBIAN_VERSION` in Dockerfiles to Bookworm
- Make `pinned` docker tag an alias to `pinned-bookworm`
2024-11-25 14:43:32 +00:00
Alex Chi Z.
4630b70962 fix(pageserver): ensure all layers are flushed before measuring RSS (#9861)
## Problem

close https://github.com/neondatabase/neon/issues/9761

The test assumed that no new L0 layers are flushed throughout the
process, which is not true.

## Summary of changes

Fix the test case `test_compaction_l0_memory` by flushing in-memory
layers before compaction.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-11-25 14:25:18 +00:00
Conrad Ludgate
6f6749c4a9 chore: update rustls (#9871) 2024-11-25 12:01:30 +00:00
Folke Behrens
0d1e82f0a7 Bump futures-* crates, drop unused license, hide duplicate crate warnings (#9858)
* The futures-util crate we use was yanked. Bump it and its siblings to
a new patch release.
https://github.com/rust-lang/futures-rs/releases/tag/0.3.31
* cargo-deny: Drop an unused license.
* cargo-deny: Don't warn about duplicate crates. Duplicate crates are
unavoidable and the noise just hides real warnings.
2024-11-25 10:59:49 +00:00
Alexander Bayandin
6f7aeaa1c5 test_runner: use LFC by default (#8613)
## Problem
LFC is not enabled by default in tests, but it is enabled in production.
This increases the risk of errors in production that were not caught
by the routine workflow.
However, enabling LFC for all the tests may overload the disk on our
servers and increase the number of failures.
So, we try enabling LFC in one case to evaluate the possible risk.

## Summary of changes
A new environment variable, `USE_LFC`, is introduced. If it is set to true,
LFC is enabled by default in all the tests.
In our workflow, we enable LFC for PG17, release, x86-64, and disable it
for all other combinations.

---------

Co-authored-by: Alexey Masterov <alexeymasterov@neon.tech>
Co-authored-by: a-masterov <72613290+a-masterov@users.noreply.github.com>
2024-11-25 09:01:05 +00:00
Ivan Efremov
995e729ebe Merge pull request #9832 from neondatabase/rc/release-proxy/2024-11-21
Proxy release 2024-11-21
2024-11-21 09:41:31 +02:00
github-actions[bot]
76077e1ddf Proxy release 2024-11-21 2024-11-21 06:02:11 +00:00
Ivan Efremov
0467d88f06 Merge pull request #9756 from neondatabase/rc/proxy/2024-11-14
Proxy release 2024-11-14
2024-11-14 09:46:52 +02:00
Ivan Efremov
f5eec194e7 Merge pull request #9674 from neondatabase/rc/proxy/2024-11-07
Proxy release 2024-11-07
2024-11-07 12:07:12 +02:00
Conrad Ludgate
7e00be391d Merge pull request #9558 from neondatabase/rc/proxy/2024-10-29
Auth broker release 2024-10-29
2024-10-29 12:10:50 +00:00
Folke Behrens
d56599df2a Merge pull request #9499 from neondatabase/rc/proxy/2024-10-24
Proxy release 2024-10-24
2024-10-24 10:34:56 +02:00
Folke Behrens
9d9aab3680 Merge pull request #9426 from neondatabase/rc/proxy/2024-10-17
Proxy release 2024-10-17
2024-10-17 12:18:51 +02:00
Folke Behrens
a202b1b5cc Merge pull request #9341 from neondatabase/rc/proxy/2024-10-10
Proxy release 2024-10-10
2024-10-10 09:17:11 +02:00
Folke Behrens
90f731f3b1 Merge pull request #9256 from neondatabase/rc/proxy/2024-10-03
Proxy release 2024-10-03
2024-10-03 11:01:41 +02:00
Conrad Ludgate
7736b748d3 Merge pull request #9159 from neondatabase/rc/proxy/2024-09-26
Proxy release 2024-09-26
2024-09-26 09:22:33 +01:00
Conrad Ludgate
9c23333cb3 Merge pull request #9056 from neondatabase/rc/proxy/2024-09-19
Proxy release 2024-09-19
2024-09-19 10:41:17 +01:00
Conrad Ludgate
66a99009ba Merge pull request #8799 from neondatabase/rc/proxy/2024-08-22
Proxy release 2024-08-22
2024-08-22 10:04:56 +01:00
Conrad Ludgate
5d4c57491f Merge pull request #8723 from neondatabase/rc/proxy/2024-08-14
Proxy release 2024-08-14
2024-08-14 13:05:51 +01:00
Conrad Ludgate
73935ea3a2 Merge pull request #8647 from neondatabase/rc/proxy/2024-08-08
Proxy release 2024-08-08
2024-08-08 15:37:09 +01:00
Conrad Ludgate
32e595d4dd Merge branch 'release-proxy' into rc/proxy/2024-08-08 2024-08-08 13:53:33 +01:00
Conrad Ludgate
b0d69acb07 Merge pull request #8505 from neondatabase/rc/proxy/2024-07-25
Proxy release 2024-07-25
2024-07-25 11:07:19 +01:00
Conrad Ludgate
98355a419a Merge pull request #8351 from neondatabase/rc/proxy/2024-07-11
Proxy release 2024-07-11
2024-07-11 10:40:17 +01:00
Conrad Ludgate
cfb03d6cf0 Merge pull request #8178 from neondatabase/rc/proxy/2024-06-27
Proxy release 2024-06-27
2024-06-27 11:35:30 +01:00
Conrad Ludgate
d81ef3f962 Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076)"
This reverts commit 78d9059fc7.
2024-06-27 09:46:58 +01:00
Conrad Ludgate
5d62c67e75 Merge pull request #8117 from neondatabase/rc/proxy/2024-06-20
Proxy release 2024-06-20
2024-06-20 11:42:35 +01:00
Anna Khanova
53d53d5b1e Merge pull request #7980 from neondatabase/rc/proxy/2024-06-06
Proxy release 2024-06-06
2024-06-06 13:14:40 +02:00
Anna Khanova
29fe6ea47a Merge pull request #7909 from neondatabase/rc/proxy/2024-05-30
Proxy release 2024-05-30
2024-05-30 14:59:41 +02:00
Alexander Bayandin
640327ccb3 Merge pull request #7880 from neondatabase/rc/proxy/2024-05-24
Proxy release 2024-05-24
2024-05-24 18:00:18 +01:00
Anna Khanova
7cf0f6b37e Merge pull request #7853 from neondatabase/rc/proxy/2024-05-23
Proxy release 2024-05-23
2024-05-23 12:09:13 +02:00
Anna Khanova
03c2c569be [proxy] Do not fail after parquet upload error (#7858)
## Problem

If the parquet upload is unsuccessful, the proxy panics.

## Summary of changes

Write the error to the logs instead.
2024-05-23 11:44:47 +02:00
Conrad Ludgate
eff6d4538a Merge pull request #7654 from neondatabase/rc/proxy/2024-05-08
Proxy release 2024-05-08
2024-05-08 11:56:20 +01:00
Conrad Ludgate
5ef7782e9c Merge pull request #7649 from neondatabase/rc/proxy/2024-05-08
Proxy release 2024-05-08
2024-05-08 06:54:03 +01:00
Conrad Ludgate
73101db8c4 Merge branch 'release-proxy' into rc/proxy/2024-05-08 2024-05-08 06:43:57 +01:00
Anna Khanova
bccdfc6d39 Merge pull request #7580 from neondatabase/rc/proxy/2024-05-02
Proxy release 2024-05-02
2024-05-02 12:00:01 +02:00
Anna Khanova
99595813bb proxy: keep track on the number of events from redis by type. (#7582)
## Problem

It's unclear what the distribution of messages the proxy consumes from
Redis looks like.

## Summary of changes

Add counter.
2024-05-02 11:56:19 +02:00
Anna Khanova
fe07b54758 Merge pull request #7507 from neondatabase/rc/proxy/2024-04-25
Proxy release 2024-04-25
2024-04-25 13:50:05 +02:00
Anna Khanova
a42d173e7b proxy: Fix cancellations (#7510)
## Problem

Cancellations were published to a channel that was never read.

## Summary of changes

Fall back to global Redis publishing.
2024-04-25 13:42:25 +02:00
Anna Khanova
e07f689238 Update connect to compute and wake compute retry configs (#7509)
## Problem

## Summary of changes

Decrease waiting time
2024-04-25 13:20:21 +02:00
Conrad Ludgate
7831eddc88 Merge pull request #7417 from neondatabase/rc/proxy/2024-04-18
Proxy release 2024-04-18
2024-04-18 12:03:07 +01:00
Conrad Ludgate
943b1bc80c Merge pull request #7366 from neondatabase/proxy-hotfix
Release proxy (2024-04-11 hotfix)
2024-04-12 10:15:14 +01:00
Conrad Ludgate
95a184e9b7 proxy: fix overloaded db connection closure (#7364)
## Problem

It was possible for database connections to not close in time.

## Summary of changes

Force connections to close if the client has hung up.
2024-04-11 23:38:47 +01:00
Conrad Ludgate
3fa17e9d17 Merge pull request #7357 from neondatabase/rc/proxy/2024-04-11
Proxy release 2024-04-11
2024-04-11 11:49:45 +01:00
Anna Khanova
55e0fd9789 Merge pull request #7304 from neondatabase/rc/proxy/2024-04-04
Proxy release 2024-04-04
2024-04-04 12:40:11 +02:00
Anna Khanova
2a88889f44 Merge pull request #7254 from neondatabase/rc/proxy/2024-03-27
Proxy release 2024-03-27
2024-03-27 11:44:09 +01:00
Conrad Ludgate
5bad8126dc Merge pull request #7173 from neondatabase/rc/proxy/2024-03-19
Proxy release 2024-03-19
2024-03-19 12:11:42 +00:00
Anna Khanova
27bc242085 Merge pull request #7119 from neondatabase/rc/proxy/2024-03-14
Proxy release 2024-03-14
2024-03-14 14:57:05 +05:00
Anna Khanova
192b49cc6d Merge branch 'release-proxy' into rc/proxy/2024-03-14 2024-03-14 14:16:36 +05:00
Conrad Ludgate
e1b60f3693 Merge pull request #7041 from neondatabase/rc/proxy/2024-03-07
Proxy release 2024-03-07
2024-03-08 08:19:16 +00:00
Anna Khanova
2804f5323b Merge pull request #6997 from neondatabase/rc/proxy/2024-03-04
Proxy release 2024-03-04
2024-03-04 17:36:11 +04:00
Anna Khanova
676adc6b32 Merge branch 'release-proxy' into rc/proxy/2024-03-04 2024-03-04 16:41:46 +04:00
Nikita Kalyanov
96a4e8de66 Add /terminate API (#6745) (#6853)
this is to speed up suspends, see
https://github.com/neondatabase/cloud/issues/10284


Cherry-pick to release branch to build new compute images
2024-02-22 11:51:19 +02:00
Arseny Sher
01180666b0 Merge pull request #6803 from neondatabase/releases/2024-02-19
Release 2024-02-19
2024-02-19 16:38:35 +04:00
Conrad Ludgate
6c94269c32 Merge pull request #6758 from neondatabase/release-proxy-2024-02-14
2024-02-14 Proxy Release
2024-02-15 09:45:08 +00:00
Anna Khanova
edc691647d Proxy: remove fail fast logic to connect to compute (#6759)
## Problem

Flaky tests

## Summary of changes

Remove failfast logic
2024-02-15 07:42:12 +00:00
Conrad Ludgate
855d7b4781 hold cancel session (#6750)
## Problem

In a recent refactor, we accidentally dropped the cancel session early

## Summary of changes

Hold the cancel session during proxy passthrough
2024-02-14 14:57:22 +00:00
Anna Khanova
c49c9707ce Proxy: send cancel notifications to all instances (#6719)
## Problem

If a cancel request ends up on the wrong proxy instance, it doesn't take
effect.

## Summary of changes

Send redis notifications to all proxy pods about the cancel request.

Related issue: https://github.com/neondatabase/neon/issues/5839,
https://github.com/neondatabase/cloud/issues/10262
2024-02-14 14:57:22 +00:00
Anna Khanova
2227540a0d Proxy refactor auth+connect (#6708)
## Problem

Not really a problem, just refactoring.

## Summary of changes

Separate authenticate from wake compute.

Do not call wake compute a second time if we managed to connect to
postgres, or if the compute address did not come from the cache.
2024-02-14 14:57:22 +00:00
Conrad Ludgate
f1347f2417 proxy: add more http logging (#6726)
## Problem

hard to see where time is taken during HTTP flow.

## Summary of changes

add a lot more for query state. add a conn_id field to the sql-over-http
span
2024-02-14 14:57:22 +00:00
Conrad Ludgate
30b295b017 proxy: some more parquet data (#6711)
## Summary of changes

add auth_method and database to the parquet logs
2024-02-14 14:57:22 +00:00
Anna Khanova
1cef395266 Proxy: copy bidirectional fork (#6720)
## Problem

`tokio::io::copy_bidirectional` doesn't close the connection once one of
the sides closes it. It's not really suitable for the postgres protocol.

## Summary of changes

Fork `copy_bidirectional` and initiate a shutdown for both connections.
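
A minimal sketch of the idea (not the actual forked implementation): once either direction hits EOF, shut down both writers so neither side of the proxied session is left half-open. The sketch simply drops the other copy future; the real fork is more careful about flushing.

```rust
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};

async fn copy_until_either_closes<A, B>(a: A, b: B) -> std::io::Result<()>
where
    A: AsyncRead + AsyncWrite + Unpin,
    B: AsyncRead + AsyncWrite + Unpin,
{
    let (mut a_rd, mut a_wr) = tokio::io::split(a);
    let (mut b_rd, mut b_wr) = tokio::io::split(b);

    // Whichever direction finishes first ends the session.
    tokio::select! {
        res = tokio::io::copy(&mut a_rd, &mut b_wr) => { res?; }
        res = tokio::io::copy(&mut b_rd, &mut a_wr) => { res?; }
    }

    // Propagate shutdown to both peers instead of leaving the other half open.
    a_wr.shutdown().await?;
    b_wr.shutdown().await?;
    Ok(())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Two in-memory "sockets": client<->proxy and proxy<->compute.
    let (mut client, proxy_client_side) = tokio::io::duplex(64 * 1024);
    let (_compute, proxy_compute_side) = tokio::io::duplex(64 * 1024);

    client.write_all(b"SELECT 1;").await?;
    drop(client); // client hangs up

    copy_until_either_closes(proxy_client_side, proxy_compute_side).await
}
```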

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
2024-02-14 14:57:22 +00:00
John Spray
78d160f76d Merge pull request #6721 from neondatabase/releases/2024-02-12
Release 2024-02-12
2024-02-12 09:35:30 +00:00
Vlad Lazar
b9238059d6 Merge pull request #6617 from neondatabase/releases/2024-02-05
Release 2024-02-05
2024-02-05 12:50:38 +00:00
Arpad Müller
d0cb4b88c8 Don't preserve temp files on creation errors of delta layers (#6612)
There is currently no cleanup done after a delta layer creation error,
so temporary delta layer files can accumulate. The problem gets worse as
the operation gets retried and the files pile up on disk. Therefore, delete
them from disk (if something has been written to disk).
2024-02-05 09:58:18 +00:00
John Spray
1ec3e39d4e Merge pull request #6504 from neondatabase/releases/2024-01-29
Release 2024-01-29
2024-01-29 10:05:01 +00:00
John Spray
a1a74eef2c Merge pull request #6420 from neondatabase/releases/2024-01-22
Release 2024-01-22
2024-01-22 17:24:11 +00:00
John Spray
90e689adda pageserver: mark tenant broken when cancelling attach (#6430)
## Problem

When a tenant is in Attaching state, and waiting for the
`concurrent_tenant_warmup` semaphore, it also listens for the tenant
cancellation token. When that token fires, Tenant::attach drops out.
Meanwhile, Tenant::set_stopping waits forever for the tenant to exit
Attaching state.

Fixes: https://github.com/neondatabase/neon/issues/6423

## Summary of changes

- In the absence of a valid state for the tenant, it is set to Broken in
this path. A more elegant solution will require more refactoring, beyond
this minimal fix.

(cherry picked from commit 93572a3e99)
2024-01-22 16:20:57 +00:00
Christian Schwarz
f0b2d4b053 fixup(#6037): actually fix the issue, #6388 failed to do so (#6429)
Before this patch, the select! still returned immediately if `futs` was
empty. Must have tested a stale build in my manual testing of #6388.

(cherry picked from commit 15c0df4de7)
2024-01-22 15:23:12 +00:00
Anna Khanova
299d9474c9 Proxy: fix gc (#6426)
## Problem

Gc currently doesn't work properly.

## Summary of changes

Change the statement used when running GC.
2024-01-22 14:39:09 +01:00
Conrad Ludgate
7234208b36 bump shlex (#6421)
## Problem

https://rustsec.org/advisories/RUSTSEC-2024-0006

## Summary of changes

`cargo update -p shlex`

(cherry picked from commit 5559b16953)
2024-01-22 09:49:33 +00:00
Christian Schwarz
93450f11f5 Merge pull request #6354 from neondatabase/releases/2024-01-15
Release 2024-01-15

NB: the previous release PR https://github.com/neondatabase/neon/pull/6286 was accidentally merged by merge-by-squash instead of merge-by-merge-commit.
See https://github.com/neondatabase/neon/pull/6354#issuecomment-1891706321 for more context.
2024-01-15 14:30:25 +01:00
Christian Schwarz
2f0f9edf33 Merge remote-tracking branch 'origin/release' into releases/2024-01-15 2024-01-15 09:36:42 +00:00
Christian Schwarz
d424f2b7c8 empty commit so we can produce a merge commit 2024-01-15 09:36:22 +00:00
Christian Schwarz
21315e80bc Merge branch 'releases/2024-01-08--not-squashed' into releases/2024-01-15 2024-01-15 09:31:07 +00:00
vipvap
483b66d383 Merge branch 'release' into releases/2024-01-08 (not-squashed merge of #6286)
Release PR https://github.com/neondatabase/neon/pull/6286 got
accidentally merged-by-squash instead of merge-by-merge-commit.

This commit shows how things would look like if 6286 had been
merged-by-squash.

```
git reset --hard 9f1327772
git merge --no-ff 5c0264b591
```

Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-01-15 09:28:08 +00:00
vipvap
aa72a22661 Release 2024-01-08 (#6286)
Release 2024-01-08
2024-01-08 09:26:27 +00:00
Shany Pozin
5c0264b591 Merge branch 'release' into releases/2024-01-08 2024-01-08 09:34:06 +02:00
Arseny Sher
9f13277729 Merge pull request #6242 from neondatabase/releases/2024-01-02
Release 2024-01-02
2024-01-02 12:04:43 +04:00
Arseny Sher
54aa319805 Don't split WAL record across two XLogData's when sending from safekeepers.
As the protocol demands. Not following this makes the standby complain about corrupted
WAL in various ways.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
closes https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
Arseny Sher
4a227484bf Add large insertion and slow WAL sending to test_hot_standby.
To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL
records torn across several XLogData messages. Add failpoint to safekeeper to
slow down sending. Also check for corrupted-WAL complaints in the standby log.

Make the test a bit simpler in passing, e.g. we don't need explicit commits as
autocommit is enabled by default.

https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719
https://github.com/neondatabase/cloud/issues/9057
2024-01-02 10:54:00 +04:00
Arseny Sher
2f83f85291 Add failpoint support to safekeeper.
Just a copy paste from pageserver.
2024-01-02 10:54:00 +04:00
Arseny Sher
d6cfcb0d93 Move failpoint support code to utils.
To enable them in safekeeper as well.
2024-01-02 10:54:00 +04:00
Arseny Sher
392843ad2a Fix safekeeper START_REPLICATION (term=n).
It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of
uncommitted WAL since cdb08f03 hung. Add a test for this.
2024-01-02 10:54:00 +04:00
Arseny Sher
bd4dae8f4a compute_ctl: kill postgres and sync-safekeeprs on exit.
Otherwise they are left orphaned when compute_ctl is terminated with a
signal. It was invisible most of the time because normally neon_local or k8s
kills postgres directly and then compute_ctl finishes gracefully. However, in
some tests compute_ctl gets stuck waiting for sync-safekeepers which
intentionally never ends because safekeepers are offline, and we want to stop
compute_ctl without leaving orphans behind.

This is a rather rough approach which doesn't wait for the children to terminate. A
better way would be to convert compute_ctl to async which would make waiting
easy.
2024-01-02 10:54:00 +04:00
Shany Pozin
b05fe53cfd Merge pull request #6240 from neondatabase/releases/2024-01-01
Release 2024-01-01
2024-01-01 11:07:30 +02:00
Christian Schwarz
c13a2f0df1 Merge pull request #6192 from neondatabase/releases/2023-12-19
Release 2023-12-19

We need to do a config change that requires restarting the pageservers.
Slip in two metrics-related commits that didn't make this week's regular release.
2023-12-19 14:52:47 +01:00
Christian Schwarz
39be366fc5 higher resolution histograms for getpage@lsn (#6177)
part of https://github.com/neondatabase/cloud/issues/7811
2023-12-19 13:46:59 +00:00
Christian Schwarz
6eda0a3158 [PRE-MERGE] fix metric pageserver_initial_logical_size_start_calculation
(This is a pre-merge cherry-pick of https://github.com/neondatabase/neon/pull/6191)

It wasn't being incremented.

Fixup of

    commit 1c88824ed0
    Author: Christian Schwarz <christian@neon.tech>
    Date:   Fri Dec 1 12:52:59 2023 +0100

        initial logical size calculation: add a bunch of metrics (#5995)
2023-12-19 13:46:55 +00:00
Shany Pozin
306c7a1813 Merge pull request #6173 from neondatabase/sasha_release_bypassrls_replication
Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles
2023-12-18 22:16:36 +02:00
Sasha Krassovsky
80be423a58 Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles 2023-12-18 10:22:36 -08:00
Shany Pozin
5dcfef82f2 Merge pull request #6163 from neondatabase/releases/2023-12-18
Release 2023-12-18-2
2023-12-18 15:34:17 +02:00
Christian Schwarz
e67b8f69c0 [PRE-MERGE] pageserver: Reduce tracing overhead in timeline::get #6115
Pre-merge `git merge --squash` of
https://github.com/neondatabase/neon/pull/6115

Lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug reduces the overhead
of span creation in non-debug environments.
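
A sketch of the mechanism with the `tracing` and `tracing-subscriber` crates (signatures simplified, not the actual pageserver code): a span declared at debug level is cheaply filtered out by an info-level subscriber, so the per-call span overhead disappears in production.

```rust
use tracing::{debug_span, instrument};

// Attribute form: the span for this function is only created at debug level.
#[instrument(level = "debug", skip_all, fields(layer = %layer_name))]
fn get_value_reconstruct_data(layer_name: &str) {
    // hot path: reads from the layer
    let _ = layer_name;
}

// Equivalent without the attribute macro.
fn get_or_maybe_download(layer_name: &str) {
    let _span = debug_span!("get_or_maybe_download", layer = %layer_name).entered();
    // ... maybe download, then read ...
}

fn main() {
    // An info-level subscriber: the debug spans above are filtered out cheaply.
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::INFO)
        .init();
    get_value_reconstruct_data("delta-layer-000");
    get_or_maybe_download("delta-layer-000");
}
```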
2023-12-18 13:39:48 +01:00
Shany Pozin
e546872ab4 Merge pull request #6158 from neondatabase/releases/2023-12-18
Release 2023-12-18
2023-12-18 14:24:34 +02:00
John Spray
322ea1cf7c pageserver: on-demand activation cleanups (#6157)
## Problem

#6112 added some logs and metrics: clean these up a bit:
- Avoid counting startup completions for tenants launched after startup
- Exclude no-op cases from timing histograms
- Remove a rogue log message
2023-12-18 11:14:19 +00:00
Vadim Kharitonov
3633742de9 Merge pull request #6121 from neondatabase/releases/2023-12-13
Release 2023-12-13
2023-12-13 12:39:43 +01:00
Joonas Koivunen
079d3a37ba Merge remote-tracking branch 'origin/release' into releases/2023-12-13
this handles the hotfix introduced conflict.
2023-12-13 10:07:19 +00:00
Vadim Kharitonov
a46e77b476 Merge pull request #6090 from neondatabase/releases/2023-12-11
Release 2023-12-11
2023-12-12 12:10:35 +01:00
Tristan Partin
a92702b01e Add submodule paths as safe directories as a precaution
The check-codestyle-rust-arm job requires this for some reason, so let's
just add them everywhere we do this workaround.
2023-12-11 22:00:35 +00:00
Tristan Partin
8ff3253f20 Fix git ownership issue in check-codestyle-rust-arm
We have this workaround for other jobs. Looks like this one was
forgotten about.
2023-12-11 22:00:35 +00:00
Joonas Koivunen
04b82c92a7 fix: accidential return Ok (#6106)
An error indicating request cancellation OR timeline shutdown was deemed
a reason to exit the background worker that calculated synthetic size.
Fix it so that it is only used to avoid logging such errors.

This conflicted on tenant_shard_id having already replaced tenant_id on
`main`.
2023-12-11 21:41:36 +00:00
Vadim Kharitonov
e5bf423e68 Merge branch 'release' into releases/2023-12-11 2023-12-11 11:55:48 +01:00
Vadim Kharitonov
60af392e45 Merge pull request #6057 from neondatabase/vk/patch_timescale_for_production
Revert timescaledb for pg14 and pg15 (#6056)
2023-12-06 16:21:16 +01:00
Vadim Kharitonov
661fc41e71 Revert timescaledb for pg14 and pg15 (#6056)
```
could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory
```
2023-12-06 16:14:07 +01:00
Shany Pozin
702c488f32 Merge pull request #6022 from neondatabase/releases/2023-12-04
Release 2023-12-04
2023-12-05 17:03:28 +02:00
Sasha Krassovsky
45c5122754 Remove trusted from wal2json 2023-12-04 12:36:19 -08:00
Shany Pozin
558394f710 fix merge 2023-12-04 11:41:27 +02:00
Shany Pozin
73b0898608 Merge branch 'release' into releases/2023-12-04 2023-12-04 11:36:26 +02:00
Joonas Koivunen
e65be4c2dc Merge pull request #6013 from neondatabase/releases/2023-12-01-hotfix
fix: use create_new instead of create for mutex file
2023-12-01 15:35:56 +02:00
Joonas Koivunen
40087b8164 fix: use create_new instead of create for mutex file 2023-12-01 12:54:49 +00:00
Shany Pozin
c762b59483 Merge pull request #5986 from neondatabase/Release-11-30-hotfix
Notify safekeeper readiness with systemd.
2023-11-30 10:01:05 +02:00
Arseny Sher
5d71601ca9 Notify safekeeper readiness with systemd.
To avoid downtime during deploy, as in busy regions initial load can currently
take ~30s.
2023-11-30 08:23:31 +03:00
Shany Pozin
a113c3e433 Merge pull request #5945 from neondatabase/release-2023-11-28-hotfix
Release 2023 11 28 hotfix
2023-11-28 08:14:59 +02:00
Anastasia Lubennikova
e81fc598f4 Update neon extension relocatable for existing installations (#5943) 2023-11-28 00:12:39 +00:00
Anastasia Lubennikova
48b845fa76 Make neon extension relocatable to allow SET SCHEMA (#5942) 2023-11-28 00:12:32 +00:00
Shany Pozin
27096858dc Merge pull request #5922 from neondatabase/releases/2023-11-27
Release 2023-11-27
2023-11-27 09:58:51 +02:00
Shany Pozin
4430d0ae7d Merge pull request #5876 from neondatabase/releases/2023-11-17
Release 2023-11-17
2023-11-20 09:11:58 +02:00
Joonas Koivunen
6e183aa0de Merge branch 'main' into releases/2023-11-17 2023-11-19 15:25:47 +00:00
Vadim Kharitonov
fd6d0b7635 Merge branch 'release' into releases/2023-11-17 2023-11-17 10:51:45 +01:00
Vadim Kharitonov
3710c32aae Merge pull request #5778 from neondatabase/releases/2023-11-03
Release 2023-11-03
2023-11-03 16:06:58 +01:00
Vadim Kharitonov
be83bee49d Merge branch 'release' into releases/2023-11-03 2023-11-03 11:18:15 +01:00
Alexander Bayandin
cf28e5922a Merge pull request #5685 from neondatabase/releases/2023-10-26
Release 2023-10-26
2023-10-27 10:42:12 +01:00
Em Sharnoff
7d384d6953 Bump vm-builder v0.18.2 -> v0.18.4 (#5666)
Only applicable change was neondatabase/autoscaling#584, setting
pgbouncer auth_dbname=postgres in order to stop superuser connections
from preventing databases from being dropped.
2023-10-26 20:15:45 +01:00
Em Sharnoff
4b3b37b912 Bump vm-builder v0.18.1 -> v0.18.2 (#5646)
Only applicable change was neondatabase/autoscaling#571, removing the
postgres_exporter flags `--auto-discover-databases` and
`--exclude-databases=...`
2023-10-26 20:15:29 +01:00
Shany Pozin
1d8d200f4d Merge pull request #5668 from neondatabase/sp/aux_files_cherry_pick
Cherry pick: Ignore missed AUX_FILES_KEY when generating image layer (#5660)
2023-10-26 10:08:16 +03:00
Konstantin Knizhnik
0d80d6ce18 Ignore missed AUX_FILES_KEY when generating image layer (#5660)
## Problem

Logical replication requires the new AUX_FILES_KEY, which is definitely
absent in existing databases.
We do not have a function to check if a key exists in our KV storage,
so I have to handle the error in the `list_aux_files` method.
But this key is also included in the key space range and accessed by the
`create_image_layer` method.

## Summary of changes

Check if AUX_FILES_KEY exists before including it in the keyspace.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2023-10-26 09:30:28 +03:00
Shany Pozin
f653ee039f Merge pull request #5638 from neondatabase/releases/2023-10-24
Release 2023-10-24
2023-10-24 12:10:52 +03:00
Em Sharnoff
e614a95853 Merge pull request #5610 from neondatabase/sharnoff/rc-2023-10-20-vm-monitor-fixes
Release 2023-10-20: vm-monitor memory.high throttling fixes
2023-10-20 00:11:06 -07:00
Em Sharnoff
850db4cc13 vm-monitor: Deny not fail downscale if no memory stats yet (#5606)
Fixes an issue we observed on staging that happens when the
autoscaler-agent attempts to immediately downscale the VM after binding,
which is typical for pooled computes.

The issue was occurring because the autoscaler-agent was requesting
downscaling before the vm-monitor had gathered sufficient cgroup memory
stats to be confident in approving it. When the vm-monitor returned an
internal error instead of denying downscaling, the autoscaler-agent
retried the connection and immediately hit the same issue (in part
because cgroup stats are collected per-connection, rather than
globally).
2023-10-19 21:56:55 -07:00
Em Sharnoff
8a316b1277 vm-monitor: Log full error on message handling failure (#5604)
There's currently an issue with the vm-monitor on staging that's not
really feasible to debug because the current display impl gives no
context to the errors (just says "failed to downscale").

Logging the full error should help.

For communications with the autoscaler-agent, it's ok to only provide
the outermost cause, because we can cross-reference with the VM logs.
At some point in the future, we may want to change that.
2023-10-19 21:56:50 -07:00
Em Sharnoff
4d13bae449 vm-monitor: Switch from memory.high to polling memory.stat (#5524)
tl;dr it's really hard to avoid throttling from memory.high, and it
counts tmpfs & page cache usage, so it's also hard to make sense of.

In the interest of fixing things quickly with something that should be
*good enough*, this PR switches to instead periodically fetch memory
statistics from the cgroup's memory.stat and use that data to determine
if and when we should upscale.

This PR fixes #5444, which has a lot more detail on the difficulties
we've hit with memory.high. This PR also supersedes #5488.
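
A sketch of the polling approach; the cgroup path and the choice of fields are assumptions for illustration, not the vm-monitor's exact logic:

```rust
use std::collections::HashMap;
use std::fs;

// cgroup v2 memory.stat exposes per-kind byte counts as "key value" lines.
fn read_memory_stat(cgroup: &str) -> std::io::Result<HashMap<String, u64>> {
    let raw = fs::read_to_string(format!("/sys/fs/cgroup/{cgroup}/memory.stat"))?;
    Ok(raw
        .lines()
        .filter_map(|line| {
            let (key, value) = line.split_once(' ')?;
            Some((key.to_string(), value.parse().ok()?))
        })
        .collect())
}

fn main() -> std::io::Result<()> {
    let stat = read_memory_stat("neon-postgres")?;
    // Anonymous memory can't be reclaimed like page cache or tmpfs, so it is a
    // better upscaling signal than the raw usage that memory.high throttles on.
    let anon = stat.get("anon").copied().unwrap_or(0);
    println!("anon bytes: {anon}");
    Ok(())
}
```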
2023-10-19 21:56:36 -07:00
Vadim Kharitonov
49377abd98 Merge pull request #5577 from neondatabase/releases/2023-10-17
Release 2023-10-17
2023-10-17 12:21:20 +02:00
Christian Schwarz
a6b2f4e54e limit imitate accesses concurrency, using same semaphore as compactions (#5578)
Before this PR, when we restarted pageserver, we'd see a rush of
`$number_of_tenants` concurrent eviction tasks starting to do imitate
accesses building up in the period of `[init_order allows activations,
$random_access_delay + EvictionPolicyLayerAccessThreshold::period]`.

We simply cannot handle that degree of concurrent IO.

We already solved the problem for compactions by adding a semaphore.
So, this PR shares that semaphore for use by evictions.

Part of https://github.com/neondatabase/neon/issues/5479

Which is again part of https://github.com/neondatabase/neon/issues/4743

Risks / Changes In System Behavior
==================================

* we don't do evictions as timely as we currently do
* we log a bunch of warnings about eviction taking too long
* imitate accesses and compactions compete for the same concurrency
limit, so, they'll slow each other down through this shared semaphore

Changes
=======

- Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs`
- Rename it to `CONCURRENT_BACKGROUND_TASKS`
- Use it also for the eviction imitate accesses:
    - Imitate accesses are both per-TIMELINE and per-TENANT
    - The per-TENANT is done through coalescing all the per-TIMELINE
      tasks via a tokio mutex `eviction_task_tenant_state`.
    - We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the
      beginning of the eviction iteration, much before the imitate
      accesses start (and they may not even start at all in the given
      iteration, as they happen only every $threshold).
    - Acquiring early is **sub-optimal** because when the per-timeline
      tasks coalesce on the `eviction_task_tenant_state` mutex,
      they are already holding a CONCURRENT_BACKGROUND_TASKS permit.
    - It's also unfair because tenants with many timelines win
      the CONCURRENT_BACKGROUND_TASKS more often.
    - I don't think there's another way though, without refactoring
      more of the imitate accesses logic, e.g, making it all per-tenant.
- Add metrics for queue depth behind the semaphore.
I found these very useful to understand what work is queued in the
system.

    - The metrics are tagged by the new `BackgroundLoopKind`.
    - On a green slate, I would have used `TaskKind`, but we already had
      pre-existing labels whose names didn't map exactly to task kind.
      Also the task kind is kind of a lower-level detail, so, I think
it's fine to have a separate enum to identify background work kinds.

Future Work
===========

I guess I could move the eviction tasks from a ticker to "sleep for
$period".
The benefit would be that the semaphore automatically "smears" the
eviction task scheduling over time, so, we only have the rush on restart
but a smeared-out rush afterward.

The downside is that this perverts the meaning of "$period", as we'd
actually not run the eviction at a fixed period. It also means the
"took too long" warning & metric become meaningless.

Then again, that is already the case for the compaction and gc tasks,
which do sleep for `$period` instead of using a ticker.

(cherry picked from commit 9256788273)
2023-10-17 12:16:26 +02:00
Shany Pozin
face60d50b Merge pull request #5526 from neondatabase/releases/2023-10-11
Release 2023-10-11
2023-10-11 11:16:39 +03:00
Shany Pozin
9768aa27f2 Merge pull request #5516 from neondatabase/releases/2023-10-10
Release 2023-10-10
2023-10-10 14:16:47 +03:00
Shany Pozin
96b2e575e1 Merge pull request #5445 from neondatabase/releases/2023-10-03
Release 2023-10-03
2023-10-04 13:53:37 +03:00
Alexander Bayandin
7222777784 Update checksums for pg_jsonschema & pg_graphql (#5455)
## Problem

Folks have re-tagged releases for `pg_jsonschema` and `pg_graphql` (to
increase timeouts on their CI). For us, these are noop changes,
but unfortunately they will cause our builds to fail due to checksum
mismatches (this might not strike right away because of the build cache).
- 8ba7c7be9d
- aa7509370a

## Summary of changes
- `pg_jsonschema` update checksum
- `pg_graphql` update checksum
2023-10-03 18:44:30 +01:00
Em Sharnoff
5469fdede0 Merge pull request #5422 from neondatabase/sharnoff/rc-2023-09-28-fix-restart-on-postmaster-SIGKILL
Release 2023-09-28: Fix (lack of) restart on neonvm postmaster SIGKILL
2023-09-28 10:48:51 -07:00
MMeent
72aa6b9fdd Fix neon_zeroextend's WAL logging (#5387)
When you log more than a few blocks, you need to reserve the space in
advance. We didn't do that, so we got errors. Now we do that, and
shouldn't get errors.
2023-09-28 09:37:28 -07:00
Em Sharnoff
ae0634b7be Bump vm-builder v0.17.11 -> v0.17.12 (#5407)
Only relevant change is neondatabase/autoscaling#534 - refer there for
more details.
2023-09-28 09:28:04 -07:00
Shany Pozin
70711f32fa Merge pull request #5375 from neondatabase/releases/2023-09-26
Release 2023-09-26
2023-09-26 15:19:45 +03:00
Vadim Kharitonov
52a88af0aa Merge pull request #5336 from neondatabase/releases/2023-09-19
Release 2023-09-19
2023-09-19 11:16:43 +02:00
Alexander Bayandin
b7a43bf817 Merge branch 'release' into releases/2023-09-19 2023-09-19 09:07:20 +01:00
Alexander Bayandin
dce91b33a4 Merge pull request #5318 from neondatabase/releases/2023-09-15-1
Postgres 14/15: Use previous extensions versions
2023-09-15 16:30:44 +01:00
Alexander Bayandin
23ee4f3050 Revert plv8 only 2023-09-15 15:45:23 +01:00
Alexander Bayandin
46857e8282 Postgres 14/15: Use previous extensions versions 2023-09-15 15:27:00 +01:00
Alexander Bayandin
368ab0ce54 Merge pull request #5313 from neondatabase/releases/2023-09-15
Release 2023-09-15
2023-09-15 10:39:56 +01:00
Konstantin Knizhnik
a5987eebfd References to old and new blocks were mixed in xlog_heap_update handler (#5312)
## Problem

See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029

https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178

## Summary of changes

Swap old/new block references

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-15 10:11:41 +01:00
Alexander Bayandin
6686ede30f Update checksum for pg_hint_plan (#5309)
## Problem

The checksum for `pg_hint_plan` doesn't match:
```
sha256sum: WARNING: 1 computed checksum did NOT match
```

Ref
https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307

It seems that the release was retagged yesterday:
https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0

I don't see any malicious changes from 15_1.5.1:
https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0,
so it should be ok to update.

## Summary of changes
- Update checksum for `pg_hint_plan` 16_1.6.0
2023-09-15 09:54:42 +01:00
Em Sharnoff
373c7057cc vm-monitor: Fix cgroup throttling (#5303)
I believe this (not actual IO problems) is the cause of the "disk speed
issue" that we've had for VMs recently. See e.g.:

1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH
2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659

The vm-informant (and now, the vm-monitor, its replacement) is supposed
to gradually increase the `neon-postgres` cgroup's memory.high value,
because otherwise the kernel will throttle all the processes in the
cgroup.

This PR fixes a bug with the vm-monitor's implementation of this
behavior.

---

Other references, for the vm-informant's implementation:

- Original issue: neondatabase/autoscaling#44
- Original PR: neondatabase/autoscaling#223
2023-09-15 09:54:42 +01:00
Shany Pozin
7d6ec16166 Merge pull request #5296 from neondatabase/releases/2023-09-13
Release 2023-09-13
2023-09-13 13:49:14 +03:00
Shany Pozin
0e6fdc8a58 Merge pull request #5283 from neondatabase/releases/2023-09-12
Release 2023-09-12
2023-09-12 14:56:47 +03:00
Christian Schwarz
521438a5c6 fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock:

1. DELETE request gets all the way to `tenant.shutdown(progress,
false).await.is_err() ` , while holding TENANTS.read()
2. POST request for tenant creation comes in, calls `tenant_map_insert`,
it does `let mut guard = TENANTS.write().await;`
3. Something that `tenant.shutdown()` needs to wait for needs a
`TENANTS.read().await`.
The only case identified in exhaustive manual scanning of the code base
is this one:
Imitate size access does `get_tenant().await`, which does
`TENANTS.read().await` under the hood.

In the above case (1) waits for (3), (3)'s read-lock request is queued
behind (2)'s write-lock, and (2) waits for (1).
Deadlock.

I made a reproducer/proof-that-above-hypothesis-holds in
https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for
merge yet and we want the fix _now_.

fixes https://github.com/neondatabase/neon/issues/5284
2023-09-12 14:13:13 +03:00
Vadim Kharitonov
07d7874bc8 Merge pull request #5202 from neondatabase/releases/2023-09-05
Release 2023-09-05
2023-09-05 12:16:06 +02:00
Anastasia Lubennikova
1804111a02 Merge pull request #5161 from neondatabase/rc-2023-08-31
Release 2023-08-31
2023-08-31 16:53:17 +03:00
Arthur Petukhovsky
cd0178efed Merge pull request #5150 from neondatabase/release-sk-fix-active-timeline
Release 2023-08-30
2023-08-30 11:43:39 +02:00
Shany Pozin
333574be57 Merge pull request #5133 from neondatabase/releases/2023-08-29
Release 2023-08-29
2023-08-29 14:02:58 +03:00
Alexander Bayandin
79a799a143 Merge branch 'release' into releases/2023-08-29 2023-08-29 11:17:57 +01:00
Conrad Ludgate
9da06af6c9 Merge pull request #5113 from neondatabase/release-http-connection-fix
Release 2023-08-25
2023-08-25 17:21:35 +01:00
Conrad Ludgate
ce1753d036 proxy: dont return connection pending (#5107)
## Problem

We were returning Pending when a connection had a notice/notification
(introduced recently in #5020). When returning pending, the runtime
assumes you will call `cx.waker().wake()` in order to continue
processing.

We weren't doing that, so the connection task would get stuck

## Summary of changes

Don't return pending. Loop instead
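
A toy version of the rule, not the proxy's actual connection future (it uses `futures::executor::block_on` to stay minimal): only return `Poll::Pending` when something has registered the waker; if there is more buffered work, loop.

```rust
use std::collections::VecDeque;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

enum Message {
    Notice(&'static str),
    Data(&'static str),
}

// Toy connection: a queue of already-buffered messages. Ready work is always
// immediately available here, which is exactly the case where returning
// Pending would park the task forever (nothing would ever wake it).
struct Passthrough {
    buffered: VecDeque<Message>,
}

impl Future for Passthrough {
    type Output = ();

    fn poll(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<()> {
        loop {
            match self.buffered.pop_front() {
                // Handle notices inline and keep looping instead of returning
                // Pending without a registered wakeup (the bug being fixed).
                Some(Message::Notice(n)) => println!("notice: {n}"),
                Some(Message::Data(d)) => println!("forward: {d}"),
                None => return Poll::Ready(()),
            }
        }
    }
}

fn main() {
    let fut = Passthrough {
        buffered: VecDeque::from([
            Message::Notice("connection ready"),
            Message::Data("SELECT 1"),
        ]),
    };
    futures::executor::block_on(fut);
}
```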
2023-08-25 16:42:30 +01:00
Alek Westover
67db8432b4 Fix cargo deny errors (#5068)
## Problem
cargo deny lint broken

Links to the CVEs:

[rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052)

[rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053)
One is fixed, the other one isn't so we allow it (for now), to unbreak
CI. Then later we'll try to get rid of webpki in favour of the rustls
fork.

## Summary of changes
```
+ignore = ["RUSTSEC-2023-0052"]
```
2023-08-25 16:42:30 +01:00
Vadim Kharitonov
4e2e44e524 Enable neon-pool-opt-in (#5062) 2023-08-22 09:06:14 +01:00
Vadim Kharitonov
ed786104f3 Merge pull request #5060 from neondatabase/releases/2023-08-22
Release 2023-08-22
2023-08-22 09:41:02 +02:00
Stas Kelvich
84b74f2bd1 Merge pull request #4997 from neondatabase/sk/proxy-release-23-07-15
Fix lint
2023-08-15 18:54:20 +03:00
Arthur Petukhovsky
fec2ad6283 Fix lint 2023-08-15 18:49:02 +03:00
Stas Kelvich
98eebd4682 Merge pull request #4996 from neondatabase/sk/proxy_release
Disable neon-pool-opt-in
2023-08-15 18:37:50 +03:00
Arthur Petukhovsky
2f74287c9b Disable neon-pool-opt-in 2023-08-15 18:34:17 +03:00
Shany Pozin
aee1bf95e3 Merge pull request #4990 from neondatabase/releases/2023-08-15
Release 2023-08-15
2023-08-15 15:34:38 +03:00
Shany Pozin
b9de9d75ff Merge branch 'release' into releases/2023-08-15 2023-08-15 14:35:00 +03:00
Stas Kelvich
7943b709e6 Merge pull request #4940 from neondatabase/sk/release-23-05-25-proxy-fixup
Release: proxy retry fixup
2023-08-09 13:53:19 +03:00
Conrad Ludgate
d7d066d493 proxy: delay auth on retry (#4929)
## Problem

When an endpoint is shutting down, it can take a few seconds. Currently
when starting a new compute, this causes an "endpoint is in transition"
error. We need to add delays before retrying to ensure that we allow
time for the endpoint to shutdown properly.

## Summary of changes

Adds a delay before retrying in auth. connect_to_compute already has
this delay
2023-08-09 12:54:24 +03:00
Felix Prasanna
e78ac22107 release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932)
This reverts commit 682dfb3a31.

hotfix for a CLI arg issue in the monitor
2023-08-08 21:08:46 +03:00
Vadim Kharitonov
76a8f2bb44 Merge pull request #4923 from neondatabase/releases/2023-08-08
Release 2023-08-08
2023-08-08 11:44:38 +02:00
Vadim Kharitonov
8d59a8581f Merge branch 'release' into releases/2023-08-08 2023-08-08 10:54:34 +02:00
Vadim Kharitonov
b1ddd01289 Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889)
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-08-03 16:28:31 +03:00
Alexander Bayandin
6eae4fc9aa Release 2023-08-02: update pg_embedding (#4877)
Cherry-picking ca4d71a954 from `main` into
the `release`

Co-authored-by: Vadim Kharitonov <vadim2404@users.noreply.github.com>
2023-08-03 08:48:09 +02:00
Christian Schwarz
765455bca2 Merge pull request #4861 from neondatabase/releases/2023-08-01--2-fix-pipeline
ci: fix upload-postgres-extensions-to-s3 job
2023-08-01 13:22:07 +02:00
Christian Schwarz
4204960942 ci: fix upload-postgres-extensions-to-s3 job
commit

	commit 5f8fd640bf
	Author: Alek Westover <alek.westover@gmail.com>
	Date:   Wed Jul 26 08:24:03 2023 -0400

	    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but,
the `promote-images` job only uploads `latest` to the prod ECR.

The switch to using release tag was good in principle, but,
reverting that part to make the release pipeline work.

Note that a proper fix should abandon use of the `:latest` tag
altogether: currently, if a `main` pipeline runs concurrently
with a `release` pipeline, the `release` pipeline may end
up using the `main` pipeline's images.
2023-08-01 12:01:45 +02:00
Christian Schwarz
67345d66ea Merge pull request #4858 from neondatabase/releases/2023-08-01
Release 2023-08-01
2023-08-01 10:44:01 +02:00
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately, instead it hangs until we timeout (10s).

## Summary of changes

Refactor the loop to allow fast retries of compute_wake and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598 compute nodes are not accessible some time after wake up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update connect retry mechanism to support handling IO errors and
sleeping for 100ms

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work on the way to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949

(cherry picked from commit d6e35222ea)
2023-06-23 20:54:07 +02:00
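A rough sketch of the idea with illustrative runtime setup (the real pageserver defines its runtimes elsewhere): heavy background work gets its own runtime so it cannot starve the executor serving management requests.

```rust
use once_cell::sync::Lazy;
use tokio::runtime::{Builder, Runtime};

// Illustrative only: a dedicated runtime for CPU/IO-heavy background tasks.
static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread()
        .thread_name("background worker")
        .enable_all()
        .build()
        .expect("failed to create BACKGROUND_RUNTIME")
});

fn spawn_synthetic_size_worker() {
    // Previously this kind of work ran on the management-request runtime,
    // where its synchronous disk IO blocked executor threads.
    BACKGROUND_RUNTIME.spawn(async {
        // ... periodic logical size calculation ...
    });
}
```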
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, it wasn't clear from the log when Postgres started up
completely if catalog updates were skipped.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and an enhanced open connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
2023-05-25 17:59:17 +03:00
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates to the
layer map, if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but
better to live with the bug that we've had in production for a long
time, than rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally buffer is empty before proxy pass, but this is
not the case with pipeline mode of our npm driver; fixes the connection hangup
introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in legacy proxy too in order to avoid breaking
connections for the clients and allow for example backups to run even
during deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
    ```
The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL
14 and 15.2 for PostgreSQL 15.
    ```
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
previously we applied the ratelimiting only up to receiving the headers
from s3, or somewhere near it. the commit adds an adapter which carries
the permit until the AsyncRead has been disposed.

fixes #3662.
2023-02-22 15:07:57 +02:00
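A minimal sketch of such an adapter, assuming a tokio semaphore is used for the rate limit (the real type names differ): the permit lives inside the reader and is released only when the reader is dropped.

```rust
use std::io;
use std::pin::Pin;
use std::task::{Context, Poll};

use tokio::io::{AsyncRead, ReadBuf};
use tokio::sync::OwnedSemaphorePermit;

// Wraps a download stream together with its rate-limit permit so the slot
// is held until the body has been fully read (or the stream is dropped),
// not just until the response headers arrive.
struct PermittedDownload<R> {
    inner: R,
    _permit: OwnedSemaphorePermit, // dropped together with the stream
}

impl<R: AsyncRead + Unpin> AsyncRead for PermittedDownload<R> {
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        Pin::new(&mut self.get_mut().inner).poll_read(cx, buf)
    }
}
```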
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file, which led to a deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
these are happening in tests because of #3655 but they sure took some
time to appear.

makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` into a globally allowed error, because it
has been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Previously, only the timelines that had passed the `gc_horizon` were
processed, which produced orphans at the tree_sort phase. Example
input is in the added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would have been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously tenant_size 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: dont stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of an already
deleted role. This was causing a crash loop when compute was restarted
with a spec that includes a delta operation for role deletion. To avoid
such cases, check that the role is still present before calling
`reassign_owned_objects`.

Resolves neondatabase/cloud#3553
2023-01-23 17:46:13 +02:00
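A sketch of the guard (not the actual compute_ctl code; the query and function name are illustrative): the delta-operation handler can call this before attempting `reassign_owned_objects`, making a replayed spec a no-op.

```rust
use tokio_postgres::Client;

// Returns true if the role still exists; if it was already dropped on a
// previous run of the same spec, the caller simply skips reassignment.
async fn role_exists(client: &Client, role: &str) -> Result<bool, tokio_postgres::Error> {
    let row = client
        .query_one(
            "SELECT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = $1)",
            &[&role],
        )
        .await?;
    Ok(row.get(0))
}
```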
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages fo the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres
2023-01-23 17:46:12 +02:00
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to staging url
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main', and don't give the
console a chance to observe the error. Which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
2023-01-23 17:46:12 +02:00
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context in debugging that issue.
2023-01-23 17:46:12 +02:00
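The pattern being added, shown as a hedged sketch rather than the actual call sites: wrap filesystem operations with the path so errors like the one above say where they happened.

```rust
use std::path::Path;

use anyhow::Context;

// Attach the offending path to the io::Error, so a failure reads
// "Failed to remove directory /some/path: Directory not empty" instead of
// a bare "Directory not empty (os error 39)".
fn remove_timeline_dir(path: &Path) -> anyhow::Result<()> {
    std::fs::remove_dir(path)
        .with_context(|| format!("Failed to remove directory {}", path.display()))?;
    Ok(())
}
```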
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release`. Deleting them here
        storage_broker/src/bin/storage_broker.rs
        Hotfix toned down logging, but `main` has since implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to new region see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to new region see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to fix an issue with missing keepalives.
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but we need to understand the root cause better.
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but it also updates libc and that causes
trouble in the resulting image as different extensions were built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (a previous omission).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-22 02:26:28 +03:00
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver.
- There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic.
- There was an issue with getting the `NoKeepAlives` reason in a loop. The issue is probably the same as the previous one.
- There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers indefinitely. This is fixed by using the retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeepers errors.

New retry logic works like this (see the sketch after this merge commit's message):
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
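A self-contained sketch of the retry policy described in the walreceiver item above ("New retry logic works like this"); the struct and constants are illustrative, not the pageserver's real types:

```rust
use std::time::{Duration, Instant};

const INITIAL_BACKOFF: Duration = Duration::from_millis(100);
const MAX_BACKOFF: Duration = Duration::from_secs(30);

// One entry per candidate safekeeper.
struct Candidate {
    next_retry_at: Option<Instant>,
    backoff: Duration,
}

impl Candidate {
    // A candidate is considered for connection only once its cooldown expired.
    fn eligible(&self, now: Instant) -> bool {
        self.next_retry_at.map_or(true, |t| now >= t)
    }

    // On disconnect, push the next attempt out with exponential backoff.
    fn on_disconnect(&mut self, now: Instant) {
        self.next_retry_at = Some(now + self.backoff);
        self.backoff = (self.backoff * 2).min(MAX_BACKOFF);
    }

    // When last_record_lsn advanced using this safekeeper's WAL, reset the
    // cooldown so we may reconnect to it immediately.
    fn on_progress(&mut self) {
        self.next_retry_at = None;
        self.backoff = INITIAL_BACKOFF;
    }
}
```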
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions, or (in the future) upgrading postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncate it

refer #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush inmemory layer eventually when no new data arrives, which helps
safekeepers to suspend activity (stop pushing to the broker). Default 10m should
be ok.

* Share exponential backoff code and fix logic for delete task failure (#2252)

* Fix bug when import large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.

* disable `test_import_from_pageserver_multisegment` (#2258)

This test failed consistently on `main` now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause for the test failure.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler and
human-readable as well. We don't use base64 anymore, since
url encoding got us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery at all and fix get_binaries for version variable

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could make it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; error during decoding is normal
I decided to put it separately.

Test
cargo test test_find_end_of_wal_last_crossing_segment
prepared earlier by @yeputons passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug which looks like `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered all safekeepers in some strange cases. I removed this filter, it should probably help with #2237
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down.
- Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second.
- `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper is stuck.

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses this branch neondatabase/postgres#171 and several required commits from main to use only locally built compute-tools. This should allow us to roll out the safekeepers sync issue fix on prod
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2022-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
100 changed files with 2516 additions and 403 deletions

View File

@@ -19,8 +19,8 @@ on:
description: 'debug or release'
required: true
type: string
pg-versions:
description: 'a json array of postgres versions to run regression tests on'
test-cfg:
description: 'a json object of postgres versions and lfc states to run regression tests on'
required: true
type: string
@@ -276,14 +276,14 @@ jobs:
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
pg_version: ${{ fromJson(inputs.pg-versions) }}
matrix: ${{ fromJSON(format('{{"include":{0}}}', inputs.test-cfg)) }}
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Pytest regression tests
continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
@@ -300,6 +300,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540

View File

@@ -541,7 +541,7 @@ jobs:
runs-on: ${{ matrix.RUNNER }}
container:
image: neondatabase/build-tools:pinned
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -558,12 +558,12 @@ jobs:
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
cd /home/nonroot
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb"
dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg
dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg
dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench

View File

@@ -2,6 +2,17 @@ name: Build build-tools image
on:
workflow_call:
inputs:
archs:
description: "Json array of architectures to build"
# Default values are set in `check-image` job, `set-variables` step
type: string
required: false
debians:
description: "Json array of Debian versions to build"
# Default values are set in `check-image` job, `set-variables` step
type: string
required: false
outputs:
image-tag:
description: "build-tools tag"
@@ -32,25 +43,37 @@ jobs:
check-image:
runs-on: ubuntu-22.04
outputs:
tag: ${{ steps.get-build-tools-tag.outputs.image-tag }}
found: ${{ steps.check-image.outputs.found }}
archs: ${{ steps.set-variables.outputs.archs }}
debians: ${{ steps.set-variables.outputs.debians }}
tag: ${{ steps.set-variables.outputs.image-tag }}
everything: ${{ steps.set-more-variables.outputs.everything }}
found: ${{ steps.set-more-variables.outputs.found }}
steps:
- uses: actions/checkout@v4
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
- name: Set variables
id: set-variables
env:
ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
IMAGE_TAG: |
${{ hashFiles('build-tools.Dockerfile',
'.github/workflows/build-build-tools-image.yml') }}
run: |
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT}
echo "debians=${DEBIANS}" | tee -a ${GITHUB_OUTPUT}
echo "image-tag=${IMAGE_TAG}" | tee -a ${GITHUB_OUTPUT}
- name: Check if such tag found in the registry
id: check-image
- name: Set more variables
id: set-more-variables
env:
IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }}
IMAGE_TAG: ${{ steps.set-variables.outputs.image-tag }}
EVERYTHING: |
${{ contains(fromJson(steps.set-variables.outputs.archs), 'x64') &&
contains(fromJson(steps.set-variables.outputs.archs), 'arm64') &&
contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') &&
contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }}
run: |
if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then
found=true
@@ -58,8 +81,8 @@ jobs:
found=false
fi
echo "found=${found}" | tee -a $GITHUB_OUTPUT
echo "everything=${EVERYTHING}" | tee -a ${GITHUB_OUTPUT}
echo "found=${found}" | tee -a ${GITHUB_OUTPUT}
build-image:
needs: [ check-image ]
@@ -67,8 +90,8 @@ jobs:
strategy:
matrix:
debian-version: [ bullseye, bookworm ]
arch: [ x64, arm64 ]
arch: ${{ fromJson(needs.check-image.outputs.archs) }}
debian: ${{ fromJson(needs.check-image.outputs.debians) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -99,11 +122,11 @@ jobs:
push: true
pull: true
build-args: |
DEBIAN_VERSION=${{ matrix.debian-version }}
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }}
DEBIAN_VERSION=${{ matrix.debian }}
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || '' }}
tags: |
neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian-version }}-${{ matrix.arch }}
neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }}
merge-images:
needs: [ check-image, build-image ]
@@ -117,16 +140,22 @@ jobs:
- name: Create multi-arch image
env:
DEFAULT_DEBIAN_VERSION: bullseye
DEFAULT_DEBIAN_VERSION: bookworm
ARCHS: ${{ join(fromJson(needs.check-image.outputs.archs), ' ') }}
DEBIANS: ${{ join(fromJson(needs.check-image.outputs.debians), ' ') }}
EVERYTHING: ${{ needs.check-image.outputs.everything }}
IMAGE_TAG: ${{ needs.check-image.outputs.tag }}
run: |
for debian_version in bullseye bookworm; do
tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}")
if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
for debian in ${DEBIANS}; do
tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian}")
if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}")
fi
docker buildx imagetools create "${tags[@]}" \
neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \
neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64
for arch in ${ARCHS}; do
tags+=("neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}")
done
docker buildx imagetools create "${tags[@]}"
done

View File

@@ -253,7 +253,14 @@ jobs:
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16", "v17"]' || '["v17"]' }}
# In release builds, additionally run v17 with LFC; all other combinations run without LFC
test-cfg: |
${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
{"pg_version":"v15", "lfc_state": "without-lfc"},
{"pg_version":"v16", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "without-lfc"},
{"pg_version":"v17", "lfc_state": "with-lfc"}]'
|| '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking

View File

@@ -29,7 +29,7 @@ jobs:
trigger_bench_on_ec2_machine_in_eu_central_1:
runs-on: [ self-hosted, small ]
container:
image: neondatabase/build-tools:pinned
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

View File

@@ -94,7 +94,7 @@ jobs:
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
env:
DEFAULT_DEBIAN_VERSION: bullseye
DEFAULT_DEBIAN_VERSION: bookworm
run: |
for debian_version in bullseye bookworm; do
tags=()

View File

@@ -23,6 +23,8 @@ jobs:
id: python-src
with:
files: |
.github/workflows/_check-codestyle-python.yml
.github/workflows/build-build-tools-image.yml
.github/workflows/pre-merge-checks.yml
**/**.py
poetry.lock
@@ -38,6 +40,10 @@ jobs:
if: needs.get-changed-files.outputs.python-changed == 'true'
needs: [ get-changed-files ]
uses: ./.github/workflows/build-build-tools-image.yml
with:
# Build only one combination to save time
archs: '["x64"]'
debians: '["bookworm"]'
secrets: inherit
check-codestyle-python:
@@ -45,7 +51,8 @@ jobs:
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-python.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".

Cargo.lock
View File

@@ -2174,9 +2174,9 @@ dependencies = [
[[package]]
name = "futures-channel"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
@@ -2184,9 +2184,9 @@ dependencies = [
[[package]]
name = "futures-core"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
@@ -2201,9 +2201,9 @@ dependencies = [
[[package]]
name = "futures-io"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-lite"
@@ -2222,9 +2222,9 @@ dependencies = [
[[package]]
name = "futures-macro"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
@@ -2233,15 +2233,15 @@ dependencies = [
[[package]]
name = "futures-sink"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
[[package]]
name = "futures-task"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
@@ -2251,9 +2251,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
[[package]]
name = "futures-util"
version = "0.3.30"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
@@ -4133,7 +4133,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4146,7 +4146,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -4165,7 +4165,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4182,7 +4182,7 @@ dependencies = [
"bytes",
"once_cell",
"pq_proto",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pemfile 2.1.1",
"serde",
"thiserror",
@@ -4518,7 +4518,7 @@ dependencies = [
"rsa",
"rstest",
"rustc-hash",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"scopeguard",
@@ -5231,9 +5231,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.23.16"
version = "0.23.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f"
dependencies = [
"log",
"once_cell",
@@ -5364,6 +5364,7 @@ dependencies = [
"itertools 0.10.5",
"metrics",
"once_cell",
"pageserver_api",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
@@ -5395,6 +5396,7 @@ dependencies = [
"tracing-subscriber",
"url",
"utils",
"wal_decoder",
"walproposer",
"workspace_hack",
]
@@ -5948,7 +5950,7 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"prost",
"rustls 0.23.16",
"rustls 0.23.18",
"tokio",
"tonic",
"tonic-build",
@@ -6031,7 +6033,7 @@ dependencies = [
"postgres_ffi",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-native-certs 0.8.0",
"serde",
"serde_json",
@@ -6466,7 +6468,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"async-trait",
"byteorder",
@@ -6493,7 +6495,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
dependencies = [
"ring",
"rustls 0.23.16",
"rustls 0.23.18",
"tokio",
"tokio-postgres",
"tokio-rustls 0.26.0",
@@ -6527,7 +6529,7 @@ version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
dependencies = [
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pki-types",
"tokio",
]
@@ -6936,7 +6938,7 @@ dependencies = [
"base64 0.22.1",
"log",
"once_cell",
"rustls 0.23.16",
"rustls 0.23.18",
"rustls-pki-types",
"url",
"webpki-roots 0.26.1",
@@ -7021,6 +7023,7 @@ dependencies = [
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
"strum_macros",
@@ -7117,10 +7120,16 @@ name = "wal_decoder"
version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"bytes",
"pageserver_api",
"postgres_ffi",
"prost",
"serde",
"thiserror",
"tokio",
"tonic",
"tonic-build",
"tracing",
"utils",
"workspace_hack",
@@ -7598,7 +7607,7 @@ dependencies = [
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"reqwest 0.12.4",
"rustls 0.23.16",
"rustls 0.23.18",
"scopeguard",
"serde",
"serde_json",

View File

@@ -7,7 +7,7 @@ ARG IMAGE=build-tools
ARG TAG=pinned
ARG DEFAULT_PG_VERSION=17
ARG STABLE_PG_VERSION=16
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
# Build Postgres

View File

@@ -38,6 +38,7 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
PG_CFLAGS += -DUSE_PREFETCH
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable

View File

@@ -1,4 +1,4 @@
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION

View File

@@ -3,7 +3,7 @@ ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_VERSION=bullseye
ARG DEBIAN_VERSION=bookworm
ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
#########################################################################################

View File

@@ -58,7 +58,7 @@ use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
@@ -326,7 +326,7 @@ fn wait_spec(
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
pgversion: get_pg_version(pgbin),
pgversion: get_pg_version_string(pgbin),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),

View File

@@ -29,6 +29,7 @@ use anyhow::Context;
use aws_config::BehaviorVersion;
use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{info, info_span, warn, Instrument};
use utils::fs_ext::is_directory_empty;
@@ -131,11 +132,17 @@ pub(crate) async fn main() -> anyhow::Result<()> {
//
// Initialize pgdata
//
let pg_version = match get_pg_version(pg_bin_dir.as_str()) {
PostgresMajorVersion::V14 => 14,
PostgresMajorVersion::V15 => 15,
PostgresMajorVersion::V16 => 16,
PostgresMajorVersion::V17 => 17,
};
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser,
locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in
pg_version,
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
pgdata: &pgdata_dir,

View File

@@ -103,14 +103,33 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
.to_string()
}
pub fn get_pg_version(pgbin: &str) -> String {
pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
let human_version = get_pg_config("--version", pgbin);
parse_pg_version(&human_version).to_string()
parse_pg_version(&human_version)
}
fn parse_pg_version(human_version: &str) -> &str {
pub fn get_pg_version_string(pgbin: &str) -> String {
match get_pg_version(pgbin) {
PostgresMajorVersion::V14 => "v14",
PostgresMajorVersion::V15 => "v15",
PostgresMajorVersion::V16 => "v16",
PostgresMajorVersion::V17 => "v17",
}
.to_owned()
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum PostgresMajorVersion {
V14,
V15,
V16,
V17,
}
fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
use PostgresMajorVersion::*;
// Normal releases have version strings like "PostgreSQL 15.4". But there
// are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
// 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
@@ -121,10 +140,10 @@ fn parse_pg_version(human_version: &str) -> &str {
.captures(human_version)
{
Some(captures) if captures.len() == 2 => match &captures["major"] {
"14" => return "v14",
"15" => return "v15",
"16" => return "v16",
"17" => return "v17",
"14" => return V14,
"15" => return V15,
"16" => return V16,
"17" => return V17,
_ => {}
},
_ => {}
@@ -263,24 +282,25 @@ mod tests {
#[test]
fn test_parse_pg_version() {
assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
use super::PostgresMajorVersion::*;
assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15);
assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15);
assert_eq!(
parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
"v15"
V15
);
assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14);
assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14);
assert_eq!(
parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
"v14"
V14
);
assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16);
assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16);
}
#[test]

View File

@@ -415,6 +415,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'timeline_offloading' as bool")?,
wal_receiver_protocol_override: settings
.remove("wal_receiver_protocol_override")
.map(serde_json::from_str)
.transpose()
.context("parse `wal_receiver_protocol_override` from json")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -33,7 +33,6 @@ reason = "the marvin attack only affects private key decryption, not public key
[licenses]
allow = [
"Apache-2.0",
"Artistic-2.0",
"BSD-2-Clause",
"BSD-3-Clause",
"CC0-1.0",
@@ -67,7 +66,7 @@ registries = []
# More documentation about the 'bans' section can be found here:
# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
[bans]
multiple-versions = "warn"
multiple-versions = "allow"
wildcards = "allow"
highlight = "all"
workspace-default-features = "allow"

View File

@@ -2,14 +2,28 @@
// This module has heavy inspiration from the prometheus crate's `process_collector.rs`.
use once_cell::sync::Lazy;
use prometheus::Gauge;
use crate::UIntGauge;
pub struct Collector {
descs: Vec<prometheus::core::Desc>,
vmlck: crate::UIntGauge,
cpu_seconds_highres: Gauge,
}
const NMETRICS: usize = 1;
const NMETRICS: usize = 2;
static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
if long == -1 {
panic!("sysconf(_SC_CLK_TCK) failed");
}
let convertible_to_f64: i32 =
i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32");
convertible_to_f64 as f64
});
impl prometheus::core::Collector for Collector {
fn desc(&self) -> Vec<&prometheus::core::Desc> {
@@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector {
mfs.extend(self.vmlck.collect())
}
}
if let Ok(stat) = myself.stat() {
let cpu_seconds = stat.utime + stat.stime;
self.cpu_seconds_highres
.set(cpu_seconds as f64 / *CLK_TCK_F64);
mfs.extend(self.cpu_seconds_highres.collect());
}
mfs
}
}
@@ -43,7 +63,23 @@ impl Collector {
.cloned(),
);
Self { descs, vmlck }
let cpu_seconds_highres = Gauge::new(
"libmetrics_process_cpu_seconds_highres",
"Total user and system CPU time spent in seconds.\
Sub-second resolution, hence better than `process_cpu_seconds_total`.",
)
.unwrap();
descs.extend(
prometheus::core::Collector::desc(&cpu_seconds_highres)
.into_iter()
.cloned(),
);
Self {
descs,
vmlck,
cpu_seconds_highres,
}
}
}
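A quick worked example of the conversion above, assuming the common Linux tick rate of _SC_CLK_TCK = 100 (an assumption; the code reads the real value via sysconf): utime + stime = 1234 ticks corresponds to 1234 / 100 = 12.34 s, i.e. roughly 10 ms granularity, which is the sub-second resolution the help text refers to.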

View File

@@ -18,7 +18,7 @@ use std::{
str::FromStr,
time::Duration,
};
use utils::logging::LogFormat;
use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol};
use crate::models::ImageCompressionAlgorithm;
use crate::models::LsnLease;
@@ -120,6 +120,7 @@ pub struct ConfigToml {
pub no_sync: Option<bool>,
#[serde(with = "humantime_serde")]
pub server_side_batch_timeout: Option<Duration>,
pub wal_receiver_protocol: PostgresClientProtocol,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -277,6 +278,8 @@ pub struct TenantConfigToml {
/// Enable auto-offloading of timelines.
/// (either this flag or the pageserver-global one need to be set)
pub timeline_offloading: bool,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
pub mod defaults {
@@ -330,6 +333,9 @@ pub mod defaults {
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None;
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Vanilla;
}
impl Default for ConfigToml {
@@ -418,6 +424,7 @@ impl Default for ConfigToml {
.map(|duration| humantime::parse_duration(duration).unwrap()),
tenant_config: TenantConfigToml::default(),
no_sync: None,
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
}
}
}
@@ -505,6 +512,7 @@ impl Default for TenantConfigToml {
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,
wal_receiver_protocol_override: None,
}
}
}

View File

@@ -229,6 +229,18 @@ impl Key {
}
}
impl CompactKey {
pub fn raw(&self) -> i128 {
self.0
}
}
impl From<i128> for CompactKey {
fn from(value: i128) -> Self {
Self(value)
}
}
impl fmt::Display for Key {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(

View File

@@ -23,6 +23,7 @@ use utils::{
completion,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
postgres_client::PostgresClientProtocol,
serde_system_time,
};
@@ -352,6 +353,7 @@ pub struct TenantConfig {
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
/// The policy for the aux file storage.

View File

@@ -562,6 +562,9 @@ pub enum BeMessage<'a> {
options: &'a [&'a str],
},
KeepAlive(WalSndKeepAlive),
/// Batch of interpreted, shard-filtered WAL records,
/// ready for the pageserver to ingest
InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
}
/// Common shorthands.
@@ -672,6 +675,22 @@ pub struct WalSndKeepAlive {
pub request_reply: bool,
}
/// Batch of interpreted WAL records used in the interpreted
/// safekeeper to pageserver protocol.
///
/// Note that the pageserver uses the RawInterpretedWalRecordsBody
/// counterpart of this from the neondatabase/rust-postgres repo.
/// If you're changing this struct, you likely need to change its
/// twin as well.
#[derive(Debug)]
pub struct InterpretedWalRecordsBody<'a> {
/// End of raw WAL in [`Self::data`]
pub streaming_lsn: u64,
/// Current end of WAL on the server
pub commit_lsn: u64,
pub data: &'a [u8],
}
pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);
// single text column
@@ -996,6 +1015,19 @@ impl BeMessage<'_> {
Ok(())
})?
}
BeMessage::InterpretedWalRecords(rec) => {
// We use the COPY_DATA_TAG for our custom message
// since this tag is interpreted as raw bytes.
buf.put_u8(b'd');
write_body(buf, |buf| {
buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol
// dependency
buf.put_u64(rec.streaming_lsn);
buf.put_u64(rec.commit_lsn);
buf.put_slice(rec.data);
});
}
}
Ok(())
}
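To make the framing concrete, here is a minimal sketch of how a receiver could parse the CopyData payload written by the InterpretedWalRecords arm above. This is illustrative only; as the doc comment notes, the pageserver actually uses the RawInterpretedWalRecordsBody counterpart from the rust-postgres fork, and parse_interpreted_frame below is a hypothetical helper.

use bytes::{Buf, Bytes};

// Illustrative parser for the payload of the CopyData ('d') message above:
// a one-byte tag b'0', two big-endian u64 LSNs, then the raw record batch.
fn parse_interpreted_frame(mut body: Bytes) -> Option<(u64, u64, Bytes)> {
    if body.remaining() < 1 + 8 + 8 || body.get_u8() != b'0' {
        return None; // not an INTERPRETED_WAL_RECORD_TAG frame
    }
    let streaming_lsn = body.get_u64(); // put_u64 writes big-endian
    let commit_lsn = body.get_u64();
    Some((streaming_lsn, commit_lsn, body)) // remainder is `data`
}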

View File

@@ -176,7 +176,9 @@ pub(crate) struct BucketMetrics {
impl Default for BucketMetrics {
fn default() -> Self {
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
// The first bucket is 100 microseconds, to count requests that do not need to
// wait at all and get a permit immediately.
let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
let req_seconds = register_histogram_vec!(
"remote_storage_s3_request_seconds",

View File

@@ -33,6 +33,7 @@ pprof.workspace = true
regex.workspace = true
routerify.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true

View File

@@ -7,29 +7,88 @@ use postgres_connection::{parse_host_port, PgConnectionConfig};
use crate::id::TenantTimelineId;
#[derive(Copy, Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum InterpretedFormat {
Bincode,
Protobuf,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Compression {
Zstd { level: i8 },
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "type", content = "args")]
#[serde(rename_all = "kebab-case")]
pub enum PostgresClientProtocol {
/// Usual Postgres replication protocol
Vanilla,
/// Custom shard-aware protocol that replicates interpreted records.
/// Used to send wal from safekeeper to pageserver.
Interpreted {
format: InterpretedFormat,
compression: Option<Compression>,
},
}
pub struct ConnectionConfigArgs<'a> {
pub protocol: PostgresClientProtocol,
pub ttid: TenantTimelineId,
pub shard_number: Option<u8>,
pub shard_count: Option<u8>,
pub shard_stripe_size: Option<u32>,
pub listen_pg_addr_str: &'a str,
pub auth_token: Option<&'a str>,
pub availability_zone: Option<&'a str>,
}
impl<'a> ConnectionConfigArgs<'a> {
fn options(&'a self) -> Vec<String> {
let mut options = vec![
"-c".to_owned(),
format!("timeline_id={}", self.ttid.timeline_id),
format!("tenant_id={}", self.ttid.tenant_id),
format!(
"protocol={}",
serde_json::to_string(&self.protocol).unwrap()
),
];
if self.shard_number.is_some() {
assert!(self.shard_count.is_some());
assert!(self.shard_stripe_size.is_some());
options.push(format!("shard_count={}", self.shard_count.unwrap()));
options.push(format!("shard_number={}", self.shard_number.unwrap()));
options.push(format!(
"shard_stripe_size={}",
self.shard_stripe_size.unwrap()
));
}
options
}
}
/// Create client config for fetching WAL from safekeeper on particular timeline.
/// listen_pg_addr_str is in form host:\[port\].
pub fn wal_stream_connection_config(
TenantTimelineId {
tenant_id,
timeline_id,
}: TenantTimelineId,
listen_pg_addr_str: &str,
auth_token: Option<&str>,
availability_zone: Option<&str>,
args: ConnectionConfigArgs,
) -> anyhow::Result<PgConnectionConfig> {
let (host, port) =
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
let port = port.unwrap_or(5432);
let mut connstr = PgConnectionConfig::new_host_port(host, port)
.extend_options([
"-c".to_owned(),
format!("timeline_id={}", timeline_id),
format!("tenant_id={}", tenant_id),
])
.set_password(auth_token.map(|s| s.to_owned()));
.extend_options(args.options())
.set_password(args.auth_token.map(|s| s.to_owned()));
if let Some(availability_zone) = availability_zone {
if let Some(availability_zone) = args.availability_zone {
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
}
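For orientation, the `protocol=...` option built by `options()` above embeds the serde_json encoding of PostgresClientProtocol. Below is a minimal sketch of the expected shapes given the adjacent tagging and kebab-case renames declared above; the strings in the comments are an assumption about serde's output, not a spec.

use utils::postgres_client::{Compression, InterpretedFormat, PostgresClientProtocol};

// Illustrative only: print the JSON that would be embedded in the `protocol=` option.
fn main() {
    let vanilla = serde_json::to_string(&PostgresClientProtocol::Vanilla).unwrap();
    println!("{vanilla}"); // expected shape: {"type":"vanilla"}

    let interpreted = serde_json::to_string(&PostgresClientProtocol::Interpreted {
        format: InterpretedFormat::Protobuf,
        compression: Some(Compression::Zstd { level: 1 }),
    })
    .unwrap();
    println!("{interpreted}");
    // expected shape:
    // {"type":"interpreted","args":{"format":"protobuf","compression":{"zstd":{"level":1}}}}
}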

View File

@@ -218,7 +218,7 @@ impl MemoryStatus {
fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
struct DS<'a>(&'a [MemoryStatus]);
impl<'a> Debug for DS<'a> {
impl Debug for DS<'_> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_struct("[MemoryStatus]")
.field(
@@ -233,7 +233,7 @@ impl MemoryStatus {
struct Fields<'a, F>(&'a [MemoryStatus], F);
impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
impl<F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'_, F> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_list().entries(self.0.iter().map(&self.1)).finish()
}

View File

@@ -8,11 +8,19 @@ license.workspace = true
testing = ["pageserver_api/testing"]
[dependencies]
async-compression.workspace = true
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
prost.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["io-util"] }
tonic.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[build-dependencies]
tonic-build.workspace = true

libs/wal_decoder/build.rs
View File

@@ -0,0 +1,11 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Generate rust code from .proto protobuf.
//
// Note: we previously tried to use a deterministic location at proto/ for easy
// discovery, but interference with cachepot sometimes failed the build. In any
// case, per the cargo docs a build script shouldn't output anywhere but $OUT_DIR.
tonic_build::compile_protos("proto/interpreted_wal.proto")
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
Ok(())
}

View File

@@ -0,0 +1,43 @@
syntax = "proto3";
package interpreted_wal;
message InterpretedWalRecords {
repeated InterpretedWalRecord records = 1;
optional uint64 next_record_lsn = 2;
}
message InterpretedWalRecord {
optional bytes metadata_record = 1;
SerializedValueBatch batch = 2;
uint64 next_record_lsn = 3;
bool flush_uncommitted = 4;
uint32 xid = 5;
}
message SerializedValueBatch {
bytes raw = 1;
repeated ValueMeta metadata = 2;
uint64 max_lsn = 3;
uint64 len = 4;
}
enum ValueMetaType {
Serialized = 0;
Observed = 1;
}
message ValueMeta {
ValueMetaType type = 1;
CompactKey key = 2;
uint64 lsn = 3;
optional uint64 batch_offset = 4;
optional uint64 len = 5;
optional bool will_init = 6;
}
message CompactKey {
int64 high = 1;
int64 low = 2;
}

View File

@@ -4,6 +4,7 @@
use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::pg_constants;
@@ -32,7 +33,8 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};
let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
let metadata_record =
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
@@ -51,8 +53,13 @@ impl InterpretedWalRecord {
}
impl MetadataRecord {
fn from_decoded(
/// Builds a metadata record for this WAL record, if any.
///
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
/// records are broadcast to all shards for simplicity, but this should be improved.
fn from_decoded_filtered(
decoded: &DecodedWALRecord,
shard: &ShardIdentity,
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
@@ -61,26 +68,27 @@ impl MetadataRecord {
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
match decoded.xl_rmid {
// First, generate metadata records from the decoded WAL record.
let mut metadata_record = match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
Self::decode_heapam_record(&mut buf, decoded, pg_version)
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
}
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?,
// Handle other special record types
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?,
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_TBLSPC_ID => {
tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
Ok(None)
None
}
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_XACT_ID => {
Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
Self::decode_xact_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_MULTIXACT_ID => {
Self::decode_multixact_record(&mut buf, decoded, pg_version)
Self::decode_multixact_record(&mut buf, decoded, pg_version)?
}
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?,
// This is an odd duck. It needs to go to all shards.
// Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
// in WalIngest::new), we have to send the whole DecodedWalRecord::record to
@@ -89,19 +97,48 @@ impl MetadataRecord {
// Alternatively, one can make the checkpoint part of the subscription protocol
// to the pageserver. This should work fine, but can be done at a later point.
pg_constants::RM_XLOG_ID => {
Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_LOGICALMSG_ID => {
Self::decode_logical_message_record(&mut buf, decoded)
Self::decode_logical_message_record(&mut buf, decoded)?
}
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?,
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?,
_unexpected => {
// TODO: consider failing here instead of blindly doing something without
// understanding the protocol
Ok(None)
None
}
};
// Next, filter the metadata record by shard.
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
// of the main relation. These are sharded and managed just like regular relation pages.
// See: https://github.com/neondatabase/neon/issues/9855
if let Some(
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
) = metadata_record
{
let is_local_vm_page = |heap_blk| {
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
};
// Send the old and new VM page updates to their respective shards.
clear_vm_bits.old_heap_blkno = clear_vm_bits
.old_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
clear_vm_bits.new_heap_blkno = clear_vm_bits
.new_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
// If neither VM page belongs to this shard, discard the record.
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
metadata_record = None
}
}
Ok(metadata_record)
}
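// Worked example of the routing above (block numbers made up for illustration):
// suppose a heap update carries old_heap_blkno = Some(10) and new_heap_blkno =
// Some(500_000), and HEAPBLK_TO_MAPBLOCK maps them to VM blocks owned by shards
// A and B respectively. Shard A keeps only old_heap_blkno, shard B keeps only
// new_heap_blkno, and every other shard sees both filtered to None and drops the
// ClearVmBits record entirely.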
fn decode_heapam_record(

View File

@@ -1,3 +1,4 @@
pub mod decoder;
pub mod models;
pub mod serialized_batch;
pub mod wire_format;

View File

@@ -37,12 +37,32 @@ use utils::lsn::Lsn;
use crate::serialized_batch::SerializedValueBatch;
// Code generated by protobuf.
pub mod proto {
// Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]`
// we don't use these types for anything but broker data transmission,
// so it's ok to ignore this one.
#![allow(clippy::derive_partial_eq_without_eq)]
// The generated ValueMeta has a `len` method generated for its `len` field.
#![allow(clippy::len_without_is_empty)]
tonic::include_proto!("interpreted_wal");
}
#[derive(Serialize, Deserialize)]
pub enum FlushUncommittedRecords {
Yes,
No,
}
/// A batch of interpreted WAL records
#[derive(Serialize, Deserialize)]
pub struct InterpretedWalRecords {
pub records: Vec<InterpretedWalRecord>,
// Start LSN of the next record after the batch.
// Note that said record may not belong to the current shard.
pub next_record_lsn: Option<Lsn>,
}
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
#[derive(Serialize, Deserialize)]
pub struct InterpretedWalRecord {
@@ -65,6 +85,18 @@ pub struct InterpretedWalRecord {
pub xid: TransactionId,
}
impl InterpretedWalRecord {
/// Checks if the WAL record is empty
///
/// An empty interpreted WAL record has no data or metadata and does not have to be sent to the
/// pageserver.
pub fn is_empty(&self) -> bool {
self.batch.is_empty()
&& self.metadata_record.is_none()
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
}
}
/// The interpreted part of the Postgres WAL record which requires metadata
/// writes to the underlying storage engine.
#[derive(Serialize, Deserialize)]

View File

@@ -496,11 +496,16 @@ impl SerializedValueBatch {
}
}
/// Checks if the batch is empty
///
/// A batch is empty when it contains no serialized values.
/// Note that it may still contain observed values.
/// Checks if the batch contains any serialized or observed values
pub fn is_empty(&self) -> bool {
!self.has_data() && self.metadata.is_empty()
}
/// Checks if the batch contains data
///
/// Note that if this returns false, it may still contain observed values or
/// a metadata record.
pub fn has_data(&self) -> bool {
let empty = self.raw.is_empty();
if cfg!(debug_assertions) && empty {
@@ -510,7 +515,7 @@ impl SerializedValueBatch {
.all(|meta| matches!(meta, ValueMeta::Observed(_))));
}
empty
!empty
}
/// Returns the number of values serialized in the batch

View File

@@ -0,0 +1,356 @@
use bytes::{BufMut, Bytes, BytesMut};
use pageserver_api::key::CompactKey;
use prost::{DecodeError, EncodeError, Message};
use tokio::io::AsyncWriteExt;
use utils::bin_ser::{BeSer, DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::postgres_client::{Compression, InterpretedFormat};
use crate::models::{
FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord,
};
use crate::serialized_batch::{
ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta,
};
use crate::models::proto;
#[derive(Debug, thiserror::Error)]
pub enum ToWireFormatError {
#[error("{0}")]
Bincode(#[from] SerializeError),
#[error("{0}")]
Protobuf(#[from] ProtobufSerializeError),
#[error("{0}")]
Compression(#[from] std::io::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum ProtobufSerializeError {
#[error("{0}")]
MetadataRecord(#[from] SerializeError),
#[error("{0}")]
Encode(#[from] EncodeError),
}
#[derive(Debug, thiserror::Error)]
pub enum FromWireFormatError {
#[error("{0}")]
Bincode(#[from] DeserializeError),
#[error("{0}")]
Protobuf(#[from] ProtobufDeserializeError),
#[error("{0}")]
Decompress(#[from] std::io::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum ProtobufDeserializeError {
#[error("{0}")]
Transcode(#[from] TranscodeError),
#[error("{0}")]
Decode(#[from] DecodeError),
}
#[derive(Debug, thiserror::Error)]
pub enum TranscodeError {
#[error("{0}")]
BadInput(String),
#[error("{0}")]
MetadataRecord(#[from] DeserializeError),
}
pub trait ToWireFormat {
fn to_wire(
self,
format: InterpretedFormat,
compression: Option<Compression>,
) -> impl std::future::Future<Output = Result<Bytes, ToWireFormatError>> + Send;
}
pub trait FromWireFormat {
type T;
fn from_wire(
buf: &Bytes,
format: InterpretedFormat,
compression: Option<Compression>,
) -> impl std::future::Future<Output = Result<Self::T, FromWireFormatError>> + Send;
}
impl ToWireFormat for InterpretedWalRecords {
async fn to_wire(
self,
format: InterpretedFormat,
compression: Option<Compression>,
) -> Result<Bytes, ToWireFormatError> {
use async_compression::tokio::write::ZstdEncoder;
use async_compression::Level;
let encode_res: Result<Bytes, ToWireFormatError> = match format {
InterpretedFormat::Bincode => {
let buf = BytesMut::new();
let mut buf = buf.writer();
self.ser_into(&mut buf)?;
Ok(buf.into_inner().freeze())
}
InterpretedFormat::Protobuf => {
let proto: proto::InterpretedWalRecords = self.try_into()?;
let mut buf = BytesMut::new();
proto
.encode(&mut buf)
.map_err(|e| ToWireFormatError::Protobuf(e.into()))?;
Ok(buf.freeze())
}
};
let buf = encode_res?;
let compressed_buf = match compression {
Some(Compression::Zstd { level }) => {
let mut encoder = ZstdEncoder::with_quality(
Vec::with_capacity(buf.len() / 4),
Level::Precise(level as i32),
);
encoder.write_all(&buf).await?;
encoder.shutdown().await?;
Bytes::from(encoder.into_inner())
}
None => buf,
};
Ok(compressed_buf)
}
}
impl FromWireFormat for InterpretedWalRecords {
type T = Self;
async fn from_wire(
buf: &Bytes,
format: InterpretedFormat,
compression: Option<Compression>,
) -> Result<Self, FromWireFormatError> {
let decompressed_buf = match compression {
Some(Compression::Zstd { .. }) => {
use async_compression::tokio::write::ZstdDecoder;
let mut decoded_buf = Vec::with_capacity(buf.len());
let mut decoder = ZstdDecoder::new(&mut decoded_buf);
decoder.write_all(buf).await?;
decoder.flush().await?;
Bytes::from(decoded_buf)
}
None => buf.clone(),
};
match format {
InterpretedFormat::Bincode => {
InterpretedWalRecords::des(&decompressed_buf).map_err(FromWireFormatError::Bincode)
}
InterpretedFormat::Protobuf => {
let proto = proto::InterpretedWalRecords::decode(decompressed_buf)
.map_err(|e| FromWireFormatError::Protobuf(e.into()))?;
InterpretedWalRecords::try_from(proto)
.map_err(|e| FromWireFormatError::Protobuf(e.into()))
}
}
}
}
impl TryFrom<InterpretedWalRecords> for proto::InterpretedWalRecords {
type Error = SerializeError;
fn try_from(value: InterpretedWalRecords) -> Result<Self, Self::Error> {
let records = value
.records
.into_iter()
.map(proto::InterpretedWalRecord::try_from)
.collect::<Result<Vec<_>, _>>()?;
Ok(proto::InterpretedWalRecords {
records,
next_record_lsn: value.next_record_lsn.map(|l| l.0),
})
}
}
impl TryFrom<InterpretedWalRecord> for proto::InterpretedWalRecord {
type Error = SerializeError;
fn try_from(value: InterpretedWalRecord) -> Result<Self, Self::Error> {
let metadata_record = value
.metadata_record
.map(|meta_rec| -> Result<Vec<u8>, Self::Error> {
let mut buf = Vec::new();
meta_rec.ser_into(&mut buf)?;
Ok(buf)
})
.transpose()?;
Ok(proto::InterpretedWalRecord {
metadata_record,
batch: Some(proto::SerializedValueBatch::from(value.batch)),
next_record_lsn: value.next_record_lsn.0,
flush_uncommitted: matches!(value.flush_uncommitted, FlushUncommittedRecords::Yes),
xid: value.xid,
})
}
}
impl From<SerializedValueBatch> for proto::SerializedValueBatch {
fn from(value: SerializedValueBatch) -> Self {
proto::SerializedValueBatch {
raw: value.raw,
metadata: value
.metadata
.into_iter()
.map(proto::ValueMeta::from)
.collect(),
max_lsn: value.max_lsn.0,
len: value.len as u64,
}
}
}
impl From<ValueMeta> for proto::ValueMeta {
fn from(value: ValueMeta) -> Self {
match value {
ValueMeta::Observed(obs) => proto::ValueMeta {
r#type: proto::ValueMetaType::Observed.into(),
key: Some(proto::CompactKey::from(obs.key)),
lsn: obs.lsn.0,
batch_offset: None,
len: None,
will_init: None,
},
ValueMeta::Serialized(ser) => proto::ValueMeta {
r#type: proto::ValueMetaType::Serialized.into(),
key: Some(proto::CompactKey::from(ser.key)),
lsn: ser.lsn.0,
batch_offset: Some(ser.batch_offset),
len: Some(ser.len as u64),
will_init: Some(ser.will_init),
},
}
}
}
impl From<CompactKey> for proto::CompactKey {
fn from(value: CompactKey) -> Self {
proto::CompactKey {
high: (value.raw() >> 64) as i64,
low: value.raw() as i64,
}
}
}
impl TryFrom<proto::InterpretedWalRecords> for InterpretedWalRecords {
type Error = TranscodeError;
fn try_from(value: proto::InterpretedWalRecords) -> Result<Self, Self::Error> {
let records = value
.records
.into_iter()
.map(InterpretedWalRecord::try_from)
.collect::<Result<_, _>>()?;
Ok(InterpretedWalRecords {
records,
next_record_lsn: value.next_record_lsn.map(Lsn::from),
})
}
}
impl TryFrom<proto::InterpretedWalRecord> for InterpretedWalRecord {
type Error = TranscodeError;
fn try_from(value: proto::InterpretedWalRecord) -> Result<Self, Self::Error> {
let metadata_record = value
.metadata_record
.map(|mrec| -> Result<_, DeserializeError> { MetadataRecord::des(&mrec) })
.transpose()?;
let batch = {
let batch = value.batch.ok_or_else(|| {
TranscodeError::BadInput("InterpretedWalRecord::batch missing".to_string())
})?;
SerializedValueBatch::try_from(batch)?
};
Ok(InterpretedWalRecord {
metadata_record,
batch,
next_record_lsn: Lsn(value.next_record_lsn),
flush_uncommitted: if value.flush_uncommitted {
FlushUncommittedRecords::Yes
} else {
FlushUncommittedRecords::No
},
xid: value.xid,
})
}
}
impl TryFrom<proto::SerializedValueBatch> for SerializedValueBatch {
type Error = TranscodeError;
fn try_from(value: proto::SerializedValueBatch) -> Result<Self, Self::Error> {
let metadata = value
.metadata
.into_iter()
.map(ValueMeta::try_from)
.collect::<Result<Vec<_>, _>>()?;
Ok(SerializedValueBatch {
raw: value.raw,
metadata,
max_lsn: Lsn(value.max_lsn),
len: value.len as usize,
})
}
}
impl TryFrom<proto::ValueMeta> for ValueMeta {
type Error = TranscodeError;
fn try_from(value: proto::ValueMeta) -> Result<Self, Self::Error> {
match proto::ValueMetaType::try_from(value.r#type) {
Ok(proto::ValueMetaType::Serialized) => {
Ok(ValueMeta::Serialized(SerializedValueMeta {
key: value
.key
.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::key missing".to_string())
})?
.into(),
lsn: Lsn(value.lsn),
batch_offset: value.batch_offset.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::batch_offset missing".to_string())
})?,
len: value.len.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::len missing".to_string())
})? as usize,
will_init: value.will_init.ok_or_else(|| {
TranscodeError::BadInput("ValueMeta::will_init missing".to_string())
})?,
}))
}
Ok(proto::ValueMetaType::Observed) => Ok(ValueMeta::Observed(ObservedValueMeta {
key: value
.key
.ok_or_else(|| TranscodeError::BadInput("ValueMeta::key missing".to_string()))?
.into(),
lsn: Lsn(value.lsn),
})),
Err(_) => Err(TranscodeError::BadInput(format!(
"Unexpected ValueMeta::type {}",
value.r#type
))),
}
}
}
impl From<proto::CompactKey> for CompactKey {
fn from(value: proto::CompactKey) -> Self {
(((value.high as i128) << 64) | (value.low as i128)).into()
}
}
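For orientation, a minimal sketch of how the two traits above compose into an encode/decode round trip, assuming a populated records value (errors are simply expect-ed here for brevity, and the format and compression choices are arbitrary):

use utils::postgres_client::{Compression, InterpretedFormat};
use wal_decoder::models::InterpretedWalRecords;
use wal_decoder::wire_format::{FromWireFormat, ToWireFormat};

// Illustrative round trip: encode as on the safekeeper side, decode as on the pageserver side.
async fn round_trip(records: InterpretedWalRecords) -> InterpretedWalRecords {
    let format = InterpretedFormat::Protobuf;
    let compression = Some(Compression::Zstd { level: 1 });

    let wire = records
        .to_wire(format, compression)
        .await
        .expect("encoding failed");
    InterpretedWalRecords::from_wire(&wire, format, compression)
        .await
        .expect("decoding failed")
}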

View File

@@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> {
// after setting up logging, log the effective IO engine choice and read path implementations
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
// The tenants directory contains all the pageserver local disk state.
// Create if not exists and make sure all the contents are durable before proceeding.

View File

@@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env;
use storage_broker::Uri;
use utils::logging::SecretString;
use utils::postgres_client::PostgresClientProtocol;
use once_cell::sync::OnceCell;
use reqwest::Url;
@@ -190,6 +191,8 @@ pub struct PageServerConf {
/// Maximum amount of time for which a get page request
/// might be held up for request merging.
pub server_side_batch_timeout: Option<Duration>,
pub wal_receiver_protocol: PostgresClientProtocol,
}
/// Token for authentication to safekeepers
@@ -350,6 +353,7 @@ impl PageServerConf {
server_side_batch_timeout,
tenant_config,
no_sync,
wal_receiver_protocol,
} = config_toml;
let mut conf = PageServerConf {
@@ -393,6 +397,7 @@ impl PageServerConf {
import_pgdata_upcall_api,
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
import_pgdata_aws_endpoint_url,
wal_receiver_protocol,
// ------------------------------------------------------------
// fields that require additional validation or custom handling

View File

@@ -1144,18 +1144,24 @@ pub(crate) mod mock {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
cancel: CancellationToken,
executed: Arc<AtomicUsize>,
}
impl ConsumerState {
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize {
let mut executed = 0;
async fn consume(&mut self, remote_storage: &GenericRemoteStorage) {
info!("Executing all pending deletions");
// Transform all executor messages to generic frontend messages
while let Ok(msg) = self.executor_rx.try_recv() {
loop {
use either::Either;
let msg = tokio::select! {
left = self.executor_rx.recv() => Either::Left(left),
right = self.rx.recv() => Either::Right(right),
};
match msg {
DeleterMessage::Delete(objects) => {
Either::Left(None) => break,
Either::Right(None) => break,
Either::Left(Some(DeleterMessage::Delete(objects))) => {
for path in objects {
match remote_storage.delete(&path, &self.cancel).await {
Ok(_) => {
@@ -1165,18 +1171,13 @@ pub(crate) mod mock {
error!("Failed to delete {path}, leaking object! ({e})");
}
}
executed += 1;
self.executed.fetch_add(1, Ordering::Relaxed);
}
}
DeleterMessage::Flush(flush_op) => {
Either::Left(Some(DeleterMessage::Flush(flush_op))) => {
flush_op.notify();
}
}
}
while let Ok(msg) = self.rx.try_recv() {
match msg {
ListWriterQueueMessage::Delete(op) => {
Either::Right(Some(ListWriterQueueMessage::Delete(op))) => {
let mut objects = op.objects;
for (layer, meta) in op.layers {
objects.push(remote_layer_path(
@@ -1198,33 +1199,27 @@ pub(crate) mod mock {
error!("Failed to delete {path}, leaking object! ({e})");
}
}
executed += 1;
self.executed.fetch_add(1, Ordering::Relaxed);
}
}
ListWriterQueueMessage::Flush(op) => {
Either::Right(Some(ListWriterQueueMessage::Flush(op))) => {
op.notify();
}
ListWriterQueueMessage::FlushExecute(op) => {
Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => {
// We have already executed all prior deletions because mock does them inline
op.notify();
}
ListWriterQueueMessage::Recover(_) => {
Either::Right(Some(ListWriterQueueMessage::Recover(_))) => {
// no-op in mock
}
}
info!("All pending deletions have been executed");
}
executed
}
}
pub struct MockDeletionQueue {
tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
executed: Arc<AtomicUsize>,
remote_storage: Option<GenericRemoteStorage>,
consumer: std::sync::Mutex<ConsumerState>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}
@@ -1235,29 +1230,34 @@ pub(crate) mod mock {
let executed = Arc::new(AtomicUsize::new(0));
let mut consumer = ConsumerState {
rx,
executor_rx,
cancel: CancellationToken::new(),
executed: executed.clone(),
};
tokio::spawn(async move {
if let Some(remote_storage) = &remote_storage {
consumer.consume(remote_storage).await;
}
});
Self {
tx,
executor_tx,
executed,
remote_storage,
consumer: std::sync::Mutex::new(ConsumerState {
rx,
executor_rx,
cancel: CancellationToken::new(),
}),
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
}
}
#[allow(clippy::await_holding_lock)]
pub async fn pump(&self) {
if let Some(remote_storage) = &self.remote_storage {
// Permit holding mutex across await, because this is only ever
// called once at a time in tests.
let mut locked = self.consumer.lock().unwrap();
let count = locked.consume(remote_storage).await;
self.executed.fetch_add(count, Ordering::Relaxed);
}
let (tx, rx) = tokio::sync::oneshot::channel();
self.executor_tx
.send(DeleterMessage::Flush(FlushOp { tx }))
.await
.expect("Failed to send flush message");
rx.await.ok();
}
pub(crate) fn new_client(&self) -> DeletionQueueClient {

View File

@@ -3,7 +3,7 @@ use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_flush_wait_upload_seconds",
"Time spent waiting for preceding uploads during layer flush",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_last_record_lsn",
@@ -653,6 +662,35 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_cache_entries",
"Number of entries in the relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_cache_misses",
"Relation size cache misses",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_cache_misses_old",
"Relation size cache misses where the lookup LSN is older than the last relation update"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -2106,6 +2144,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -2134,6 +2173,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
clear_vm_bits_unknown: register_int_counter_vec!(
"pageserver_wal_ingest_clear_vm_bits_unknown",
"Number of ignored ClearVmBits operations due to unknown pages/relations",
&["entity"],
)
.expect("failed to define a metric"),
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2336,6 +2381,7 @@ pub(crate) struct TimelineMetrics {
shard_id: String,
timeline_id: String,
pub flush_time_histo: StorageTimeMetrics,
pub flush_wait_upload_time_gauge: Gauge,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
pub logical_size_histo: StorageTimeMetrics,
@@ -2379,6 +2425,9 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let compact_time_histo = StorageTimeMetrics::new(
StorageTimeOperation::Compact,
&tenant_id,
@@ -2516,6 +2565,7 @@ impl TimelineMetrics {
shard_id,
timeline_id,
flush_time_histo,
flush_wait_upload_time_gauge,
compact_time_histo,
create_images_time_histo,
logical_size_histo,
@@ -2563,6 +2613,14 @@ impl TimelineMetrics {
self.resident_physical_size_gauge.get()
}
pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
self.flush_wait_upload_time_gauge.add(duration);
crate::metrics::FLUSH_WAIT_UPLOAD_TIME
.get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
.unwrap()
.add(duration);
}
pub(crate) fn shutdown(&self) {
let was_shutdown = self
.shutdown
@@ -2579,6 +2637,7 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());

View File

@@ -10,6 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
use crate::aux_file;
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
};
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -389,7 +392,9 @@ impl Timeline {
result
}
// Get size of a database in blocks
/// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
/// other shards, by only accounting for relations the shard has pages for, and only accounting
/// for pages up to the highest page number it has stored.
pub(crate) async fn get_db_size(
&self,
spcnode: Oid,
@@ -408,7 +413,10 @@ impl Timeline {
Ok(total_blocks)
}
/// Get size of a relation file
/// Get size of a relation file. The relation must exist, otherwise an error is returned.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard.
pub(crate) async fn get_rel_size(
&self,
tag: RelTag,
@@ -444,7 +452,10 @@ impl Timeline {
Ok(nblocks)
}
/// Does relation exist?
/// Does the relation exist?
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
pub(crate) async fn get_rel_exists(
&self,
tag: RelTag,
@@ -478,6 +489,9 @@ impl Timeline {
/// Get a list of all existing relations in given tablespace and database.
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
@@ -1129,9 +1143,12 @@ impl Timeline {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
}
RELSIZE_CACHE_MISSES.inc();
None
}
@@ -1156,6 +1173,7 @@ impl Timeline {
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_CACHE_ENTRIES.inc();
}
}
}
@@ -1163,13 +1181,17 @@ impl Timeline {
/// Store cached relation size
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.insert(tag, (lsn, nblocks));
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_CACHE_ENTRIES.inc();
}
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.map.remove(tag);
if rel_size_cache.map.remove(tag).is_some() {
RELSIZE_CACHE_ENTRIES.dec();
}
}
}
@@ -1229,10 +1251,9 @@ impl<'a> DatadirModification<'a> {
}
pub(crate) fn has_dirty_data(&self) -> bool {
!self
.pending_data_batch
self.pending_data_batch
.as_ref()
.map_or(true, |b| b.is_empty())
.map_or(false, |b| b.has_data())
}
/// Set the current lsn
@@ -1408,7 +1429,7 @@ impl<'a> DatadirModification<'a> {
Some(pending_batch) => {
pending_batch.extend(batch);
}
None if !batch.is_empty() => {
None if batch.has_data() => {
self.pending_data_batch = Some(batch);
}
None => {

View File

@@ -3215,6 +3215,18 @@ impl Tenant {
}
}
if let ShutdownMode::Reload = shutdown_mode {
tracing::info!("Flushing deletion queue");
if let Err(e) = self.deletion_queue_client.flush().await {
match e {
DeletionQueueError::ShuttingDown => {
// This is the only error we expect for now. In the future, if more error
// variants are added, we should handle them here.
}
}
}
}
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
// them to continue to do work during their shutdown methods, e.g. flushing data.
tracing::debug!("Cancelling CancellationToken");
@@ -5344,6 +5356,7 @@ pub(crate) mod harness {
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
timeline_offloading: Some(tenant_conf.timeline_offloading),
wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
}
}
}

View File

@@ -19,6 +19,7 @@ use serde_json::Value;
use std::num::NonZeroU64;
use std::time::Duration;
use utils::generation::Generation;
use utils::postgres_client::PostgresClientProtocol;
#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) enum AttachmentMode {
@@ -353,6 +354,9 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub timeline_offloading: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
}
impl TenantConfOpt {
@@ -418,6 +422,9 @@ impl TenantConfOpt {
timeline_offloading: self
.lazy_slru_download
.unwrap_or(global_conf.timeline_offloading),
wal_receiver_protocol_override: self
.wal_receiver_protocol_override
.or(global_conf.wal_receiver_protocol_override),
}
}
}
@@ -472,6 +479,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,
wal_receiver_protocol_override: value.wal_receiver_protocol_override,
}
}
}

View File

@@ -1960,7 +1960,7 @@ impl TenantManager {
attempt.before_reset_tenant();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Flush).await {
match tenant.shutdown(progress, ShutdownMode::Reload).await {
Ok(()) => {
slot_guard.drop_old_value().expect("it was just shutdown");
}

View File

@@ -50,6 +50,7 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{
fs_ext, pausable_failpoint,
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
@@ -893,10 +894,11 @@ pub(crate) enum ShutdownMode {
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
/// the call to [`Timeline::shutdown`].
FreezeAndFlush,
/// Only flush the layers to the remote storage without freezing any open layers. This is the
/// mode used by ancestor detach and any other operations that reloads a tenant but not increasing
/// the generation number.
Flush,
/// Only flush the layers to the remote storage without freezing any open layers. Also flush the
/// deletion queue. This is the mode used by ancestor detach and any other operation that reloads
/// a tenant without increasing the generation number. Note that this mode cannot be used at tenant
/// shutdown, as flushing the deletion queue at that time will cause shutdown-in-progress errors.
Reload,
/// Shut down immediately, without waiting for any open layers to flush.
Hard,
}
@@ -1817,7 +1819,7 @@ impl Timeline {
}
}
if let ShutdownMode::Flush = mode {
if let ShutdownMode::Reload = mode {
// drain the upload queue
self.remote_client.shutdown().await;
if !self.remote_client.no_pending_work() {
@@ -2178,6 +2180,21 @@ impl Timeline {
)
}
/// Resolve the effective WAL receiver protocol to use for this tenant.
///
/// Priority order is:
/// 1. Tenant config override
/// 2. Default value for tenant config override
/// 3. Pageserver config override
/// 4. Pageserver config default
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.wal_receiver_protocol_override
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
.unwrap_or(self.conf.wal_receiver_protocol)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
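
As context for the priority list documented above, here is a minimal, stand-alone sketch of the same fallback chain. It is illustrative only: a simplified Protocol enum stands in for PostgresClientProtocol, and plain function arguments stand in for the tenant and pageserver config structs.

// Mirrors the .or(...).unwrap_or(...) fallback in resolve_wal_receiver_protocol().
#[derive(Clone, Copy, Debug, PartialEq)]
enum Protocol {
    Vanilla,
    Interpreted,
}

fn resolve(
    tenant_override: Option<Protocol>,         // per-tenant override, wins if set
    default_tenant_override: Option<Protocol>, // default tenant-conf override
    pageserver_setting: Protocol,              // pageserver-level setting/default
) -> Protocol {
    tenant_override
        .or(default_tenant_override)
        .unwrap_or(pageserver_setting)
}

fn main() {
    assert_eq!(resolve(None, None, Protocol::Vanilla), Protocol::Vanilla);
    assert_eq!(
        resolve(Some(Protocol::Interpreted), None, Protocol::Vanilla),
        Protocol::Interpreted
    );
}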
@@ -2470,6 +2487,7 @@ impl Timeline {
*guard = Some(WalReceiver::start(
Arc::clone(self),
WalReceiverConf {
protocol: self.resolve_wal_receiver_protocol(),
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,
@@ -3829,7 +3847,8 @@ impl Timeline {
};
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
// This makes us refuse ingest until the new layers have been persisted to the remote.
// This makes us refuse ingest until the new layers have been persisted to the remote
let start = Instant::now();
self.remote_client
.wait_completion()
.await
@@ -3842,6 +3861,8 @@ impl Timeline {
FlushLayerError::Other(anyhow!(e).into())
}
})?;
let duration = start.elapsed().as_secs_f64();
self.metrics.flush_wait_upload_time_gauge_add(duration);
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
// a compaction can delete the file and then it won't be available for uploads any more.
@@ -5896,7 +5917,7 @@ impl<'a> TimelineWriter<'a> {
batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if batch.is_empty() {
if !batch.has_data() {
return Ok(());
}

View File

@@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline(
}
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
timeline.shutdown(super::ShutdownMode::Flush).await;
timeline.shutdown(super::ShutdownMode::Reload).await;
// TODO extend guard mechanism above with method
// to make deletions possible while offloading is in progress

View File

@@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::postgres_client::PostgresClientProtocol;
use self::connection_manager::ConnectionManagerStatus;
@@ -45,6 +46,7 @@ use super::Timeline;
#[derive(Clone)]
pub struct WalReceiverConf {
pub protocol: PostgresClientProtocol,
/// The timeout on the connection to safekeeper for WAL streaming.
pub wal_connect_timeout: Duration,
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.

View File

@@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig;
use utils::backoff::{
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::postgres_client::wal_stream_connection_config;
use utils::postgres_client::{
wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol,
};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
@@ -533,6 +535,7 @@ impl ConnectionManagerState {
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let protocol = self.conf.protocol;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -546,6 +549,7 @@ impl ConnectionManagerState {
let res = super::walreceiver_connection::handle_walreceiver_connection(
timeline,
protocol,
new_sk.wal_source_connconf,
events_sender,
cancellation.clone(),
@@ -984,15 +988,33 @@ impl ConnectionManagerState {
if info.safekeeper_connstr.is_empty() {
return None; // no connection string, ignore sk
}
match wal_stream_connection_config(
self.id,
info.safekeeper_connstr.as_ref(),
match &self.conf.auth_token {
None => None,
Some(x) => Some(x),
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
PostgresClientProtocol::Vanilla => {
(None, None, None)
},
self.conf.availability_zone.as_deref(),
) {
PostgresClientProtocol::Interpreted { .. } => {
let shard_identity = self.timeline.get_shard_identity();
(
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
)
}
};
let connection_conf_args = ConnectionConfigArgs {
protocol: self.conf.protocol,
ttid: self.id,
shard_number,
shard_count,
shard_stripe_size,
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
availability_zone: self.conf.availability_zone.as_deref()
};
match wal_stream_connection_config(connection_conf_args) {
Ok(connstr) => Some((*sk_id, info, connstr)),
Err(e) => {
error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
@@ -1096,6 +1118,7 @@ impl ReconnectReason {
mod tests {
use super::*;
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
use url::Host;
fn dummy_broker_sk_timeline(
@@ -1532,6 +1555,7 @@ mod tests {
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),

View File

@@ -22,7 +22,10 @@ use tokio::{select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn, Instrument};
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};
use wal_decoder::{
models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords},
wire_format::FromWireFormat,
};
use super::TaskStateUpdate;
use crate::{
@@ -36,7 +39,7 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{id::NodeId, lsn::Lsn};
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
@@ -109,6 +112,7 @@ impl From<WalDecodeError> for WalReceiverError {
#[allow(clippy::too_many_arguments)]
pub(super) async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
protocol: PostgresClientProtocol,
wal_source_connconf: PgConnectionConfig,
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
cancellation: CancellationToken,
@@ -260,6 +264,14 @@ pub(super) async fn handle_walreceiver_connection(
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
let interpreted_proto_config = match protocol {
PostgresClientProtocol::Vanilla => None,
PostgresClientProtocol::Interpreted {
format,
compression,
} => Some((format, compression)),
};
while let Some(replication_message) = {
select! {
_ = cancellation.cancelled() => {
@@ -291,6 +303,15 @@ pub(super) async fn handle_walreceiver_connection(
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
}
ReplicationMessage::RawInterpretedWalRecords(raw) => {
connection_status.latest_connection_update = now;
if !raw.data().is_empty() {
connection_status.latest_wal_update = now;
}
connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn()));
connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn()));
}
&_ => {}
};
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
@@ -298,7 +319,144 @@ pub(super) async fn handle_walreceiver_connection(
return Ok(());
}
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
let status_update = match replication_message {
ReplicationMessage::RawInterpretedWalRecords(raw) => {
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
// This is the end LSN of the raw WAL from which the records
// were interpreted.
let streaming_lsn = Lsn::from(raw.streaming_lsn());
let (format, compression) = interpreted_proto_config.unwrap();
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
.await
.with_context(|| {
anyhow::anyhow!(
"Failed to deserialize interpreted records ending at LSN {streaming_lsn}"
)
})?;
let InterpretedWalRecords {
records,
next_record_lsn,
} = batch;
tracing::debug!(
"Received WAL up to {} with next_record_lsn={:?}",
streaming_lsn,
next_record_lsn
);
// We start the modification at 0 because each interpreted record
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
let local_next_record_lsn = interpreted.next_record_lsn;
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})?;
if !ingested {
tracing::debug!(
"ingest: filtered out record @ LSN {local_next_record_lsn}"
);
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
uncommitted_records += 1;
// FIXME: this cannot be made pausable_failpoint without fixing the
// failpoint library; in tests, the added amount of debugging will cause us
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}
// Records might have been filtered out on the safekeeper side, but we still
// need to advance last record LSN on all shards. If we've not ingested the latest
// record, then set the LSN of the modification past it. This way all shards
// advance their last record LSN at the same time.
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
Some(lsn) if lsn > modification.get_lsn() => {
modification.set_lsn(lsn).unwrap();
true
}
_ => false,
};
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
info!("caught up at LSN {streaming_lsn}");
caught_up = true;
}
tracing::debug!(
"Ingested WAL up to {streaming_lsn}. Last record LSN is {}",
timeline.get_last_record_lsn()
);
Some(streaming_lsn)
}
ReplicationMessage::XLogData(xlog_data) => {
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
@@ -316,21 +474,6 @@ pub(super) async fn handle_walreceiver_connection(
let mut uncommitted_records = 0;
let mut filtered_records = 0;
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
*uncommitted = 0;
*filtered = 0;
Ok(())
}
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are

View File

@@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign<A>> {
}
}
impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> {
type Target = [u8; N];
fn deref(&self) -> &Self::Target {
@@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> {
}
}
impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.buf
}
}
impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> {
impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> {
fn as_ref(&self) -> &[u8; N] {
self.buf
}

View File

@@ -334,14 +334,32 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
old_vm_blk = None;
}
}
@@ -572,7 +590,8 @@ impl WalIngest {
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -612,7 +631,8 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1430,24 +1450,27 @@ impl WalIngest {
}
}
/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard, or None if the shard does not have any pages for it.
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let nblocks = if !modification
) -> Result<Option<BlockNumber>, PageReconstructError> {
if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await?
};
Ok(nblocks)
return Ok(None);
}
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await
.map(Some)
}
#[allow(clippy::bool_assert_comparison)]

View File

@@ -20,7 +20,7 @@
#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */
static int logical_replication_max_snap_files = 300;
static int logical_replication_max_snap_files = 10000;
/*
* According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of
@@ -184,7 +184,7 @@ InitLogicalReplicationMonitor(void)
"Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.",
NULL,
&logical_replication_max_snap_files,
300, -1, INT_MAX,
10000, -1, INT_MAX,
PGC_SIGHUP,
0,
NULL, NULL, NULL);

View File

@@ -1,7 +1,8 @@
use std::net::SocketAddr;
use std::net::{IpAddr, SocketAddr};
use std::sync::Arc;
use dashmap::DashMap;
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
use pq_proto::CancelKeyData;
use thiserror::Error;
use tokio::net::TcpStream;
@@ -17,9 +18,6 @@ use crate::rate_limiter::LeakyBucketRateLimiter;
use crate::redis::cancellation_publisher::{
CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
};
use std::net::IpAddr;
use ipnet::{IpNet, Ipv4Net, Ipv6Net};
pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use futures::TryFutureExt;
use futures::{FutureExt, TryFutureExt};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, Instrument};
@@ -88,40 +88,37 @@ pub async fn task_main(
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span();
let startup = Box::pin(
handle_client(
config,
backend,
&ctx,
cancellation_handler,
socket,
conn_gauge,
)
.instrument(span.clone()),
);
let res = startup.await;
let res = handle_client(
config,
backend,
&ctx,
cancellation_handler,
socket,
conn_gauge,
)
.instrument(ctx.span())
.boxed()
.await;
match res {
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
error!(parent: &span, "per-client task finished with an error: {e:#}");
error!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
}
Ok(None) => {
ctx.set_success();
}
Ok(Some(p)) => {
ctx.set_success();
ctx.log_connect();
match p.proxy_pass().instrument(span.clone()).await {
let _disconnect = ctx.log_connect();
match p.proxy_pass().await {
Ok(()) => {}
Err(ErrorSource::Client(e)) => {
error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
error!(?session_id, "per-client task finished with an IO error from the client: {e:#}");
}
Err(ErrorSource::Compute(e)) => {
error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}");
}
}
}
@@ -219,6 +216,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
client: stream,
aux: node.aux.clone(),
compute: node,
session_id: ctx.session_id(),
_req: request_gauge,
_conn: conn_gauge,
_cancel: session,

View File

@@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use tokio::sync::mpsc;
use tracing::field::display;
use tracing::{debug, info_span, Span};
use tracing::{debug, error, info_span, Span};
use try_lock::TryLock;
use uuid::Uuid;
@@ -272,11 +272,14 @@ impl RequestContext {
this.success = true;
}
pub fn log_connect(&self) {
self.0
.try_lock()
.expect("should not deadlock")
.log_connect();
pub fn log_connect(self) -> DisconnectLogger {
let mut this = self.0.into_inner();
this.log_connect();
// close current span.
this.span = Span::none();
DisconnectLogger(this)
}
pub(crate) fn protocol(&self) -> Protocol {
@@ -412,9 +415,11 @@ impl RequestContextInner {
});
}
if let Some(tx) = self.sender.take() {
tx.send(RequestData::from(&*self))
.inspect_err(|e| debug!("tx send failed: {e}"))
.ok();
// If type changes, this error handling needs to be updated.
let tx: mpsc::UnboundedSender<RequestData> = tx;
if let Err(e) = tx.send(RequestData::from(&*self)) {
error!("log_connect channel send failed: {e}");
}
}
}
@@ -423,9 +428,11 @@ impl RequestContextInner {
// Here we log the length of the session.
self.disconnect_timestamp = Some(Utc::now());
if let Some(tx) = self.disconnect_sender.take() {
tx.send(RequestData::from(&*self))
.inspect_err(|e| debug!("tx send failed: {e}"))
.ok();
// If type changes, this error handling needs to be updated.
let tx: mpsc::UnboundedSender<RequestData> = tx;
if let Err(e) = tx.send(RequestData::from(&*self)) {
error!("log_disconnect channel send failed: {e}");
}
}
}
}
@@ -434,8 +441,14 @@ impl Drop for RequestContextInner {
fn drop(&mut self) {
if self.sender.is_some() {
self.log_connect();
} else {
self.log_disconnect();
}
}
}
pub struct DisconnectLogger(RequestContextInner);
impl Drop for DisconnectLogger {
fn drop(&mut self) {
self.0.log_disconnect();
}
}
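
The change above turns log_connect() into a consuming call that returns a guard, so the disconnect event is emitted when the guard goes out of scope rather than via an explicit call. A minimal sketch of that drop-guard pattern, with stand-in types (Ctx is not the real RequestContext, and println! stands in for the tracing-based logging):

struct Ctx {
    session_id: u64,
}

struct DisconnectLogger(Ctx);

impl Ctx {
    fn log_connect(self) -> DisconnectLogger {
        println!("connect: session {}", self.session_id);
        // Returning a guard ties the disconnect log to scope exit.
        DisconnectLogger(self)
    }
}

impl Drop for DisconnectLogger {
    fn drop(&mut self) {
        println!("disconnect: session {}", self.0.session_id);
    }
}

fn main() {
    let ctx = Ctx { session_id: 42 };
    let _disconnect = ctx.log_connect();
    // ... proxy passthrough would run here ...
} // disconnect is logged here, even on early return or panic unwind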

View File

@@ -398,7 +398,7 @@ async fn upload_parquet(
.err();
if let Some(err) = maybe_err {
tracing::warn!(%id, %err, "failed to upload request data");
tracing::error!(%id, error = ?err, "failed to upload request data");
}
Ok(buffer.writer())

View File

@@ -114,7 +114,7 @@ impl MockControlPlane {
Ok((secret, allowed_ips))
}
.map_err(crate::error::log_error::<GetAuthInfoError>)
.inspect_err(|e: &GetAuthInfoError| tracing::error!("{e}"))
.instrument(info_span!("postgres", url = self.endpoint.as_str()))
.await?;
Ok(AuthInfo {

View File

@@ -134,8 +134,8 @@ impl NeonControlPlaneClient {
project_id: body.project_id,
})
}
.map_err(crate::error::log_error)
.instrument(info_span!("http", id = request_id))
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_auth_info"))
.await
}
@@ -193,8 +193,8 @@ impl NeonControlPlaneClient {
Ok(rules)
}
.map_err(crate::error::log_error)
.instrument(info_span!("http", id = request_id))
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_endpoint_jwks"))
.await
}
@@ -252,9 +252,8 @@ impl NeonControlPlaneClient {
Ok(node)
}
.map_err(crate::error::log_error)
// TODO: redo this span stuff
.instrument(info_span!("http", id = request_id))
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_wake_compute"))
.await
}
}

View File

@@ -10,12 +10,6 @@ pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Err
io::Error::new(io::ErrorKind::Other, e)
}
/// A small combinator for pluggable error logging.
pub(crate) fn log_error<E: fmt::Display>(e: E) -> E {
tracing::error!("{e}");
e
}
/// Marks errors that may be safely shown to a client.
/// This trait can be seen as a specialized version of [`ToString`].
///

View File

@@ -10,7 +10,7 @@ pub(crate) mod wake_compute;
use std::sync::Arc;
pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource};
use futures::TryFutureExt;
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, StartupMessageParams};
@@ -123,42 +123,39 @@ pub async fn task_main(
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span();
let startup = Box::pin(
handle_client(
config,
auth_backend,
&ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(span.clone()),
);
let res = startup.await;
let res = handle_client(
config,
auth_backend,
&ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(ctx.span())
.boxed()
.await;
match res {
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
warn!(parent: &span, "per-client task finished with an error: {e:#}");
warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}");
}
Ok(None) => {
ctx.set_success();
}
Ok(Some(p)) => {
ctx.set_success();
ctx.log_connect();
match p.proxy_pass().instrument(span.clone()).await {
let _disconnect = ctx.log_connect();
match p.proxy_pass().await {
Ok(()) => {}
Err(ErrorSource::Client(e)) => {
warn!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}");
}
Err(ErrorSource::Compute(e)) => {
error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}");
}
}
}
@@ -352,6 +349,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
client: stream,
aux: node.aux.clone(),
compute: node,
session_id: ctx.session_id(),
_req: request_gauge,
_conn: conn_gauge,
_cancel: session,

View File

@@ -59,6 +59,7 @@ pub(crate) struct ProxyPassthrough<P, S> {
pub(crate) client: Stream<S>,
pub(crate) compute: PostgresConnection,
pub(crate) aux: MetricsAuxInfo,
pub(crate) session_id: uuid::Uuid,
pub(crate) _req: NumConnectionRequestsGuard<'static>,
pub(crate) _conn: NumClientConnectionsGuard<'static>,
@@ -69,7 +70,7 @@ impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
if let Err(err) = self.compute.cancel_closure.try_cancel_query().await {
tracing::warn!(?err, "could not cancel the query in the database");
tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
}
res
}

View File

@@ -1,9 +1,9 @@
use tracing::{error, info, warn};
use tracing::{error, info};
use super::connect_compute::ComputeConnectBackend;
use crate::config::RetryConfig;
use crate::context::RequestContext;
use crate::control_plane::errors::WakeComputeError;
use crate::control_plane::errors::{ControlPlaneError, WakeComputeError};
use crate::control_plane::CachedNodeInfo;
use crate::error::ReportableError;
use crate::metrics::{
@@ -11,6 +11,18 @@ use crate::metrics::{
};
use crate::proxy::retry::{retry_after, should_retry};
// Use macro to retain original callsite.
macro_rules! log_wake_compute_error {
(error = ?$error:expr, $num_retries:expr, retriable = $retriable:literal) => {
match $error {
WakeComputeError::ControlPlane(ControlPlaneError::Message(_)) => {
info!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node")
}
_ => error!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node"),
}
};
}
pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
num_retries: &mut u32,
ctx: &RequestContext,
@@ -20,7 +32,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
loop {
match api.wake_compute(ctx).await {
Err(e) if !should_retry(&e, *num_retries, config) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
log_wake_compute_error!(error = ?e, num_retries, retriable = false);
report_error(&e, false);
Metrics::get().proxy.retries_metric.observe(
RetriesMetricGroup {
@@ -32,7 +44,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
return Err(e);
}
Err(e) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
log_wake_compute_error!(error = ?e, num_retries, retriable = true);
report_error(&e, true);
}
Ok(n) => {

View File

@@ -1,6 +1,6 @@
use core::net::IpAddr;
use std::sync::Arc;
use core::net::IpAddr;
use pq_proto::CancelKeyData;
use redis::AsyncCommands;
use tokio::sync::Mutex;

View File

@@ -28,6 +28,7 @@ hyper0.workspace = true
futures.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
pageserver_api.workspace = true
postgres.workspace = true
postgres-protocol.workspace = true
pprof.workspace = true
@@ -58,6 +59,7 @@ sd-notify.workspace = true
storage_broker.workspace = true
tokio-stream.workspace = true
utils.workspace = true
wal_decoder.workspace = true
workspace_hack.workspace = true

View File

@@ -2,11 +2,15 @@
//! protocol commands.
use anyhow::Context;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
use std::future::Future;
use std::str::{self, FromStr};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, info_span, Instrument};
use utils::postgres_client::PostgresClientProtocol;
use utils::shard::{ShardCount, ShardNumber};
use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -35,6 +39,8 @@ pub struct SafekeeperPostgresHandler {
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub ttid: TenantTimelineId,
pub shard: Option<ShardIdentity>,
pub protocol: Option<PostgresClientProtocol>,
/// Unique connection id is logged in spans for observability.
pub conn_id: ConnectionId,
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
@@ -107,11 +113,21 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
) -> Result<(), QueryError> {
if let FeStartupPacket::StartupMessage { params, .. } = sm {
if let Some(options) = params.options_raw() {
let mut shard_count: Option<u8> = None;
let mut shard_number: Option<u8> = None;
let mut shard_stripe_size: Option<u32> = None;
for opt in options {
// FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy,
// remove these after the PR gets deployed:
// https://github.com/neondatabase/neon/pull/2433#discussion_r970005064
match opt.split_once('=') {
Some(("protocol", value)) => {
self.protocol =
Some(serde_json::from_str(value).with_context(|| {
format!("Failed to parse {value} as protocol")
})?);
}
Some(("ztenantid", value)) | Some(("tenant_id", value)) => {
self.tenant_id = Some(value.parse().with_context(|| {
format!("Failed to parse {value} as tenant id")
@@ -127,9 +143,54 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
metrics.set_client_az(client_az)
}
}
Some(("shard_count", value)) => {
shard_count = Some(value.parse::<u8>().with_context(|| {
format!("Failed to parse {value} as shard count")
})?);
}
Some(("shard_number", value)) => {
shard_number = Some(value.parse::<u8>().with_context(|| {
format!("Failed to parse {value} as shard number")
})?);
}
Some(("shard_stripe_size", value)) => {
shard_stripe_size = Some(value.parse::<u32>().with_context(|| {
format!("Failed to parse {value} as shard stripe size")
})?);
}
_ => continue,
}
}
match self.protocol() {
PostgresClientProtocol::Vanilla => {
if shard_count.is_some()
|| shard_number.is_some()
|| shard_stripe_size.is_some()
{
return Err(QueryError::Other(anyhow::anyhow!(
"Shard params specified for vanilla protocol"
)));
}
}
PostgresClientProtocol::Interpreted { .. } => {
match (shard_count, shard_number, shard_stripe_size) {
(Some(count), Some(number), Some(stripe_size)) => {
let params = ShardParameters {
count: ShardCount(count),
stripe_size: ShardStripeSize(stripe_size),
};
self.shard =
Some(ShardIdentity::from_params(ShardNumber(number), &params));
}
_ => {
return Err(QueryError::Other(anyhow::anyhow!(
"Shard params were not specified"
)));
}
}
}
}
}
if let Some(app_name) = params.get("application_name") {
@@ -150,6 +211,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
tracing::field::debug(self.appname.clone()),
);
if let Some(shard) = self.shard.as_ref() {
tracing::Span::current()
.record("shard", tracing::field::display(shard.shard_slug()));
}
Ok(())
} else {
Err(QueryError::Other(anyhow::anyhow!(
@@ -258,6 +324,8 @@ impl SafekeeperPostgresHandler {
tenant_id: None,
timeline_id: None,
ttid: TenantTimelineId::empty(),
shard: None,
protocol: None,
conn_id,
claims: None,
auth,
@@ -265,6 +333,10 @@ impl SafekeeperPostgresHandler {
}
}
pub fn protocol(&self) -> PostgresClientProtocol {
self.protocol.unwrap_or(PostgresClientProtocol::Vanilla)
}
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {

View File

@@ -29,6 +29,7 @@ pub mod receive_wal;
pub mod recovery;
pub mod remove_wal;
pub mod safekeeper;
pub mod send_interpreted_wal;
pub mod send_wal;
pub mod state;
pub mod timeline;
@@ -38,6 +39,7 @@ pub mod timeline_manager;
pub mod timelines_set;
pub mod wal_backup;
pub mod wal_backup_partial;
pub mod wal_reader_stream;
pub mod wal_service;
pub mod wal_storage;

View File

@@ -17,6 +17,7 @@ use tokio::{
use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::types::PgLsn;
use tracing::*;
use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol};
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
@@ -325,7 +326,17 @@ async fn recovery_stream(
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
// TODO: pass auth token
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
let connection_conf_args = ConnectionConfigArgs {
protocol: PostgresClientProtocol::Vanilla,
ttid: tli.ttid,
shard_number: None,
shard_count: None,
shard_stripe_size: None,
listen_pg_addr_str: &donor.pg_connstr,
auth_token: None,
availability_zone: None,
};
let cfg = wal_stream_connection_config(connection_conf_args)?;
let mut cfg = cfg.to_tokio_postgres_config();
// It will make safekeeper give out not committed WAL (up to flush_lsn).
cfg.application_name(&format!("safekeeper_{}", conf.my_id));

View File

@@ -0,0 +1,148 @@
use std::time::Duration;
use anyhow::Context;
use futures::StreamExt;
use pageserver_api::shard::ShardIdentity;
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend};
use postgres_ffi::MAX_SEND_SIZE;
use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder};
use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::time::MissedTickBehavior;
use utils::lsn::Lsn;
use utils::postgres_client::Compression;
use utils::postgres_client::InterpretedFormat;
use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
use wal_decoder::wire_format::ToWireFormat;
use crate::send_wal::EndWatchView;
use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder};
/// Shard-aware interpreted record sender.
/// This is used for sending WAL to the pageserver. Said WAL
/// is pre-interpreted and filtered for the shard.
pub(crate) struct InterpretedWalSender<'a, IO> {
pub(crate) format: InterpretedFormat,
pub(crate) compression: Option<Compression>,
pub(crate) pgb: &'a mut PostgresBackend<IO>,
pub(crate) wal_stream_builder: WalReaderStreamBuilder,
pub(crate) end_watch_view: EndWatchView,
pub(crate) shard: ShardIdentity,
pub(crate) pg_version: u32,
pub(crate) appname: Option<String>,
}
struct Batch {
wal_end_lsn: Lsn,
available_wal_end_lsn: Lsn,
records: InterpretedWalRecords,
}
impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
/// Send interpreted WAL to a receiver.
/// Stops when an error occurs or the receiver is caught up and there's no active compute.
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
let mut wal_position = self.wal_stream_builder.start_pos();
let mut wal_decoder =
WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?;
let mut stream = std::pin::pin!(stream);
let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1));
keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
keepalive_ticker.reset();
let (tx, mut rx) = tokio::sync::mpsc::channel::<Batch>(2);
loop {
tokio::select! {
// Get some WAL from the stream and then: decode, interpret and push it down the
// pipeline.
wal = stream.next(), if tx.capacity() > 0 => {
let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal {
Some(some) => some?,
None => { break; }
};
wal_position = wal_end_lsn;
wal_decoder.feed_bytes(&wal);
let mut records = Vec::new();
let mut max_next_record_lsn = None;
while let Some((next_record_lsn, recdata)) = wal_decoder
.poll_decode()
.with_context(|| "Failed to decode WAL")?
{
assert!(next_record_lsn.is_aligned());
max_next_record_lsn = Some(next_record_lsn);
// Deserialize and interpret WAL record
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
&self.shard,
next_record_lsn,
self.pg_version,
)
.with_context(|| "Failed to interpret WAL")?;
if !interpreted.is_empty() {
records.push(interpreted);
}
}
let batch = InterpretedWalRecords {
records,
next_record_lsn: max_next_record_lsn
};
tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap();
},
// For a previously interpreted batch, serialize it and push it down the wire.
batch = rx.recv() => {
let batch = match batch {
Some(b) => b,
None => { break; }
};
let buf = batch
.records
.to_wire(self.format, self.compression)
.await
.with_context(|| "Failed to serialize interpreted WAL")
.map_err(CopyStreamHandlerEnd::from)?;
// Reset the keep alive ticker since we are sending something
// over the wire now.
keepalive_ticker.reset();
self.pgb
.write_message(&BeMessage::InterpretedWalRecords(InterpretedWalRecordsBody {
streaming_lsn: batch.wal_end_lsn.0,
commit_lsn: batch.available_wal_end_lsn.0,
data: &buf,
})).await?;
}
// Send a periodic keep alive when the connection has been idle for a while.
_ = keepalive_ticker.tick() => {
self.pgb
.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
wal_end: self.end_watch_view.get().0,
timestamp: get_current_timestamp(),
request_reply: true,
}))
.await?;
}
}
}
// The loop above ends when the receiver is caught up and there's no more WAL to send.
Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
self.appname, wal_position,
)))
}
}

View File

@@ -5,12 +5,15 @@ use crate::handler::SafekeeperPostgresHandler;
use crate::metrics::RECEIVED_PS_FEEDBACKS;
use crate::receive_wal::WalReceivers;
use crate::safekeeper::{Term, TermLsn};
use crate::send_interpreted_wal::InterpretedWalSender;
use crate::timeline::WalResidentTimeline;
use crate::wal_reader_stream::WalReaderStreamBuilder;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context as AnyhowContext};
use bytes::Bytes;
use futures::future::Either;
use parking_lot::Mutex;
use postgres_backend::PostgresBackend;
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
@@ -22,6 +25,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use utils::failpoint_support;
use utils::id::TenantTimelineId;
use utils::pageserver_feedback::PageserverFeedback;
use utils::postgres_client::PostgresClientProtocol;
use std::cmp::{max, min};
use std::net::SocketAddr;
@@ -226,7 +230,7 @@ impl WalSenders {
/// Get remote_consistent_lsn reported by the pageserver. Returns None if
/// client is not pageserver.
fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
let shared = self.mutex.lock();
let slot = shared.get_slot(id);
match slot.feedback {
@@ -370,6 +374,16 @@ pub struct WalSenderGuard {
walsenders: Arc<WalSenders>,
}
impl WalSenderGuard {
pub fn id(&self) -> WalSenderId {
self.id
}
pub fn walsenders(&self) -> &Arc<WalSenders> {
&self.walsenders
}
}
impl Drop for WalSenderGuard {
fn drop(&mut self) {
self.walsenders.unregister(self.id);
@@ -440,11 +454,12 @@ impl SafekeeperPostgresHandler {
}
info!(
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={:?}",
start_pos,
end_pos,
matches!(end_watch, EndWatch::Flush(_)),
appname
appname,
self.protocol(),
);
// switch to copy
@@ -456,21 +471,56 @@ impl SafekeeperPostgresHandler {
// not synchronized with sends, so this avoids deadlocks.
let reader = pgb.split().context("START_REPLICATION split")?;
let send_fut = match self.protocol() {
PostgresClientProtocol::Vanilla => {
let sender = WalSender {
pgb,
// should succeed since we're already holding another guard
tli: tli.wal_residence_guard().await?,
appname,
start_pos,
end_pos,
term,
end_watch,
ws_guard: ws_guard.clone(),
wal_reader,
send_buf: vec![0u8; MAX_SEND_SIZE],
};
Either::Left(sender.run())
}
PostgresClientProtocol::Interpreted {
format,
compression,
} => {
let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000;
let end_watch_view = end_watch.view();
let wal_stream_builder = WalReaderStreamBuilder {
tli: tli.wal_residence_guard().await?,
start_pos,
end_pos,
term,
end_watch,
wal_sender_guard: ws_guard.clone(),
};
let sender = InterpretedWalSender {
format,
compression,
pgb,
wal_stream_builder,
end_watch_view,
shard: self.shard.unwrap(),
pg_version,
appname,
};
Either::Right(sender.run())
}
};
let tli_cancel = tli.cancel.clone();
let mut sender = WalSender {
pgb,
// should succeed since we're already holding another guard
tli: tli.wal_residence_guard().await?,
appname,
start_pos,
end_pos,
term,
end_watch,
ws_guard: ws_guard.clone(),
wal_reader,
send_buf: vec![0u8; MAX_SEND_SIZE],
};
let mut reply_reader = ReplyReader {
reader,
ws_guard: ws_guard.clone(),
@@ -479,7 +529,7 @@ impl SafekeeperPostgresHandler {
let res = tokio::select! {
// todo: add read|write .context to these errors
r = sender.run() => r,
r = send_fut => r,
r = reply_reader.run() => r,
_ = tli_cancel.cancelled() => {
return Err(CopyStreamHandlerEnd::Cancelled);
@@ -504,16 +554,22 @@ impl SafekeeperPostgresHandler {
}
}
/// TODO(vlad): maybe lift this instead
/// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
/// given term (recovery by walproposer or peer safekeeper).
enum EndWatch {
#[derive(Clone)]
pub(crate) enum EndWatch {
Commit(Receiver<Lsn>),
Flush(Receiver<TermLsn>),
}
impl EndWatch {
pub(crate) fn view(&self) -> EndWatchView {
EndWatchView(self.clone())
}
/// Get current end of WAL.
fn get(&self) -> Lsn {
pub(crate) fn get(&self) -> Lsn {
match self {
EndWatch::Commit(r) => *r.borrow(),
EndWatch::Flush(r) => r.borrow().lsn,
@@ -521,15 +577,44 @@ impl EndWatch {
}
/// Wait for the update.
async fn changed(&mut self) -> anyhow::Result<()> {
pub(crate) async fn changed(&mut self) -> anyhow::Result<()> {
match self {
EndWatch::Commit(r) => r.changed().await?,
EndWatch::Flush(r) => r.changed().await?,
}
Ok(())
}
pub(crate) async fn wait_for_lsn(
&mut self,
lsn: Lsn,
client_term: Option<Term>,
) -> anyhow::Result<Lsn> {
loop {
let end_pos = self.get();
if end_pos > lsn {
return Ok(end_pos);
}
if let EndWatch::Flush(rx) = &self {
let curr_term = rx.borrow().term;
if let Some(client_term) = client_term {
if curr_term != client_term {
bail!("term changed: requested {}, now {}", client_term, curr_term);
}
}
}
self.changed().await?;
}
}
}
pub(crate) struct EndWatchView(EndWatch);
impl EndWatchView {
pub(crate) fn get(&self) -> Lsn {
self.0.get()
}
}
/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
@@ -566,7 +651,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> {
loop {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we

View File

@@ -0,0 +1,149 @@
use std::sync::Arc;
use async_stream::try_stream;
use bytes::Bytes;
use futures::Stream;
use postgres_backend::CopyStreamHandlerEnd;
use std::time::Duration;
use tokio::time::timeout;
use utils::lsn::Lsn;
use crate::{
safekeeper::Term,
send_wal::{EndWatch, WalSenderGuard},
timeline::WalResidentTimeline,
};
pub(crate) struct WalReaderStreamBuilder {
pub(crate) tli: WalResidentTimeline,
pub(crate) start_pos: Lsn,
pub(crate) end_pos: Lsn,
pub(crate) term: Option<Term>,
pub(crate) end_watch: EndWatch,
pub(crate) wal_sender_guard: Arc<WalSenderGuard>,
}
impl WalReaderStreamBuilder {
pub(crate) fn start_pos(&self) -> Lsn {
self.start_pos
}
}
pub(crate) struct WalBytes {
/// Raw PG WAL
pub(crate) wal: Bytes,
/// Start LSN of [`Self::wal`]
#[allow(dead_code)]
pub(crate) wal_start_lsn: Lsn,
/// End LSN of [`Self::wal`]
pub(crate) wal_end_lsn: Lsn,
/// End LSN of WAL available on the safekeeper.
///
/// For pageservers this will be the commit LSN,
/// while for the compute it will be the flush LSN.
pub(crate) available_wal_end_lsn: Lsn,
}
impl WalReaderStreamBuilder {
/// Builds a stream of Postgres WAL starting from [`Self::start_pos`].
/// The stream terminates when the receiver (pageserver) is fully caught up
/// and there are no active computes.
pub(crate) async fn build(
self,
buffer_size: usize,
) -> anyhow::Result<impl Stream<Item = Result<WalBytes, CopyStreamHandlerEnd>>> {
// TODO(vlad): The code below duplicates functionality from [`crate::send_wal`].
// We can make the raw WAL sender use this stream too and remove the duplication.
let Self {
tli,
mut start_pos,
mut end_pos,
term,
mut end_watch,
wal_sender_guard,
} = self;
let mut wal_reader = tli.get_walreader(start_pos).await?;
let mut buffer = vec![0; buffer_size];
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
Ok(try_stream! {
loop {
let have_something_to_send = end_pos > start_pos;
if !have_something_to_send {
// wait for lsn
let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await;
match res {
Ok(ok) => {
end_pos = ok?;
},
Err(_) => {
if let EndWatch::Commit(_) = end_watch {
if let Some(remote_consistent_lsn) = wal_sender_guard
.walsenders()
.get_ws_remote_consistent_lsn(wal_sender_guard.id())
{
if tli.should_walsender_stop(remote_consistent_lsn).await {
// Stop streaming if the receivers are caught up and
// there's no active compute. This causes the loop in
// [`crate::send_interpreted_wal::InterpretedWalSender::run`]
// to exit and terminate the WAL stream.
return;
}
}
}
continue;
}
}
}
assert!(
end_pos > start_pos,
"nothing to send after waiting for WAL"
);
// try to send as much as available, capped by the buffer size
let mut chunk_end_pos = start_pos + buffer_size as u64;
// if we went behind available WAL, back off
if chunk_end_pos >= end_pos {
chunk_end_pos = end_pos;
} else {
// If sending not up to end pos, round down to page boundary to
// avoid breaking WAL record not at page boundary, as protocol
// demands. See walsender.c (XLogSendPhysical).
chunk_end_pos = chunk_end_pos
.checked_sub(chunk_end_pos.block_offset())
.unwrap();
}
let send_size = (chunk_end_pos.0 - start_pos.0) as usize;
let buffer = &mut buffer[..send_size];
let send_size: usize;
{
// If uncommitted part is being pulled, check that the term is
// still the expected one.
let _term_guard = if let Some(t) = term {
Some(tli.acquire_term(t).await?)
} else {
None
};
// Read WAL into buffer. send_size can be additionally capped to
// segment boundary here.
send_size = wal_reader.read(buffer).await?
};
let wal = Bytes::copy_from_slice(&buffer[..send_size]);
yield WalBytes {
wal,
wal_start_lsn: start_pos,
wal_end_lsn: start_pos + send_size as u64,
available_wal_end_lsn: end_pos
};
start_pos += send_size as u64;
}
})
}
}
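
As a worked illustration of the chunk sizing in the stream above (illustrative only: plain u64 values stand in for utils::lsn::Lsn, and 8192 is assumed as the usual Postgres WAL block size underlying Lsn::block_offset()):

const WAL_BLOCK_SIZE: u64 = 8192; // assumed XLOG_BLCKSZ

fn chunk_end(start_pos: u64, end_pos: u64, buffer_size: u64) -> u64 {
    let mut chunk_end_pos = start_pos + buffer_size;
    if chunk_end_pos >= end_pos {
        // Never read past the WAL that is currently available.
        chunk_end_pos = end_pos;
    } else {
        // Round down to a page boundary so a WAL record is never cut at a
        // non-page boundary, matching the comment in the stream above.
        chunk_end_pos -= chunk_end_pos % WAL_BLOCK_SIZE;
    }
    chunk_end_pos
}

fn main() {
    // Plenty of WAL available: capped by the buffer, rounded down to 8 KiB.
    assert_eq!(chunk_end(0, 1 << 30, 128 * 1024 + 100), 128 * 1024);
    // Near the end of available WAL: send exactly up to end_pos.
    assert_eq!(chunk_end(0, 4096, 128 * 1024), 4096);
}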

View File

@@ -31,6 +31,7 @@ CREATE TABLE IF NOT EXISTS results (
duration INT NOT NULL,
flaky BOOLEAN NOT NULL,
arch arch DEFAULT 'X64',
lfc BOOLEAN DEFAULT false NOT NULL,
build_type TEXT NOT NULL,
pg_version INT NOT NULL,
run_id BIGINT NOT NULL,
@@ -54,6 +55,7 @@ class Row:
duration: int
flaky: bool
arch: str
lfc: bool
build_type: str
pg_version: int
run_id: int
@@ -132,6 +134,7 @@ def ingest_test_result(
if p["name"].startswith("__")
}
arch = parameters.get("arch", "UNKNOWN").strip("'")
lfc = parameters.get("lfc", "False") == "True"
build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
labels = {label["name"]: label["value"] for label in test["labels"]}
@@ -145,6 +148,7 @@ def ingest_test_result(
duration=test["time"]["duration"],
flaky=test["flaky"] or test["retriesStatusChange"],
arch=arch,
lfc=lfc,
build_type=build_type,
pg_version=pg_version,
run_id=run_id,

View File

@@ -128,7 +128,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
let layer_names = index_part.layer_metadata.keys().cloned().collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
result.errors.push(format!(
result.warnings.push(format!(
"index_part.json contains invalid layer map structure: {err}"
));
}

View File

@@ -168,6 +168,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
"pageserver_flush_wait_upload_seconds",
counter("pageserver_tenant_throttling_count_accounted_start"),
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),

View File

@@ -90,10 +90,12 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
COMPONENT_BINARIES,
USE_LFC,
allure_add_grafana_links,
assert_no_errors,
get_dir_size,
print_gc_result,
size_to_bytes,
subprocess_capture,
wait_until,
)
@@ -308,6 +310,31 @@ class PgProtocol:
return self.safe_psql(query, log_query=log_query)[0][0]
class PageserverWalReceiverProtocol(StrEnum):
VANILLA = "vanilla"
INTERPRETED = "interpreted"
@staticmethod
def to_config_key_value(proto) -> tuple[str, dict[str, Any]]:
if proto == PageserverWalReceiverProtocol.VANILLA:
return (
"wal_receiver_protocol",
{
"type": "vanilla",
},
)
elif proto == PageserverWalReceiverProtocol.INTERPRETED:
return (
"wal_receiver_protocol",
{
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
},
)
else:
raise ValueError(f"Unknown protocol type: {proto}")
class NeonEnvBuilder:
"""
Builder object to create a Neon runtime environment
@@ -354,6 +381,7 @@ class NeonEnvBuilder:
safekeeper_extra_opts: list[str] | None = None,
storage_controller_port_override: int | None = None,
pageserver_virtual_file_io_mode: str | None = None,
pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -407,6 +435,8 @@ class NeonEnvBuilder:
self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode
self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
assert test_name.startswith(
"test_"
), "Unexpectedly instantiated from outside a test function"
@@ -1021,6 +1051,7 @@ class NeonEnv:
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
# Create the neon_local's `NeonLocalInitConf`
cfg: dict[str, Any] = {
@@ -1090,6 +1121,13 @@ class NeonEnv:
if self.pageserver_virtual_file_io_mode is not None:
ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
if self.pageserver_wal_receiver_protocol is not None:
key, value = PageserverWalReceiverProtocol.to_config_key_value(
self.pageserver_wal_receiver_protocol
)
if key not in ps_cfg:
ps_cfg[key] = value
# Create a corresponding NeonPageserver object
self.pageservers.append(
NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"])
@@ -3742,12 +3780,45 @@ class Endpoint(PgProtocol, LogUtils):
self.pgdata_dir = self.env.repo_dir / path
self.logfile = self.endpoint_path() / "compute.log"
config_lines = config_lines or []
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines = ["max_replication_write_lag=15MB"] + config_lines
# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:
if (lfc_path := Path(self.lfc_path())).exists():
lfc_path.unlink()
else:
lfc_path.parent.mkdir(parents=True, exist_ok=True)
for line in config_lines:
if (
line.find("neon.max_file_cache_size") > -1
or line.find("neon.file_cache_size_limit") > -1
):
m = re.search(r"=\s*(\S+)", line)
assert m is not None, f"malformed config line {line}"
size = m.group(1)
assert size_to_bytes(size) >= size_to_bytes(
"1MB"
), "LFC size cannot be set less than 1MB"
# shared_buffers = 512kB to make postgres use LFC intensively
# neon.max_file_cache_size and neon.file_cache_size_limit are
# set to 1MB because a small LFC is better for testing (it helps to find more problems)
config_lines = [
"shared_buffers = 512kB",
f"neon.file_cache_path = '{self.lfc_path()}'",
"neon.max_file_cache_size = 1MB",
"neon.file_cache_size_limit = 1MB",
] + config_lines
else:
for line in config_lines:
assert (
line.find("neon.max_file_cache_size") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"
assert (
line.find("neon.file_cache_size_limit") == -1
), "Setting LFC parameters is not allowed when LFC is disabled"
self.config(config_lines)
return self
@@ -3781,6 +3852,9 @@ class Endpoint(PgProtocol, LogUtils):
basebackup_request_tries=basebackup_request_tries,
)
self._running.release(1)
self.log_config_value("shared_buffers")
self.log_config_value("neon.max_file_cache_size")
self.log_config_value("neon.file_cache_size_limit")
return self
@@ -3806,6 +3880,10 @@ class Endpoint(PgProtocol, LogUtils):
"""Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
return self.endpoint_path() / "postgresql.conf"
def lfc_path(self) -> Path:
"""Path to the lfc file"""
return self.endpoint_path() / "file_cache" / "file.cache"
def config(self, lines: list[str]) -> Self:
"""
Add lines to postgresql.conf.
@@ -3984,16 +4062,46 @@ class Endpoint(PgProtocol, LogUtils):
assert self.pgdata_dir is not None # please mypy
return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024
def clear_shared_buffers(self, cursor: Any | None = None):
def clear_buffers(self, cursor: Any | None = None):
"""
Best-effort way to clear postgres buffers. Pinned buffers will not be 'cleared.'
Might also clear LFC.
If LFC is enabled, it also clears the LFC by setting neon.file_cache_size_limit to 0
and then restoring the previous value.
"""
if cursor is not None:
cursor.execute("select clear_buffer_cache()")
if not USE_LFC:
return
cursor.execute("SHOW neon.file_cache_size_limit")
res = cursor.fetchone()
assert res, "Cannot get neon.file_cache_size_limit"
file_cache_size_limit = res[0]
if file_cache_size_limit == 0:
return
cursor.execute("ALTER SYSTEM SET neon.file_cache_size_limit=0")
cursor.execute("SELECT pg_reload_conf()")
cursor.execute(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
cursor.execute("SELECT pg_reload_conf()")
else:
self.safe_psql("select clear_buffer_cache()")
if not USE_LFC:
return
file_cache_size_limit = self.safe_psql_scalar(
"SHOW neon.file_cache_size_limit", log_query=False
)
if file_cache_size_limit == 0:
return
self.safe_psql("ALTER SYSTEM SET neon.file_cache_size_limit=0")
self.safe_psql("SELECT pg_reload_conf()")
self.safe_psql(f"ALTER SYSTEM SET neon.file_cache_size_limit='{file_cache_size_limit}'")
self.safe_psql("SELECT pg_reload_conf()")
def log_config_value(self, param):
"""
Writes the current value of the config parameter 'param' to the log.
"""
res = self.safe_psql_scalar(f"SHOW {param}", log_query=False)
log.info("%s = %s", param, res)
class EndpointFactory:

View File

@@ -116,5 +116,6 @@ def pytest_runtest_makereport(*args, **kwargs):
}.get(os.uname().machine, "UNKNOWN")
arch = os.getenv("RUNNER_ARCH", uname_m)
allure.dynamic.parameter("__arch", arch)
allure.dynamic.parameter("__lfc", os.getenv("USE_LFC") != "false")
yield

View File

@@ -57,6 +57,10 @@ VERSIONS_COMBINATIONS = (
)
# fmt: on
# If the environment variable USE_LFC is set and its value is "false", then LFC is disabled for tests.
# If it is not set or set to a value not equal to "false", LFC is enabled by default.
USE_LFC = os.environ.get("USE_LFC") != "false"
def subprocess_capture(
capture_dir: Path,
@@ -653,6 +657,23 @@ def allpairs_versions():
return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}
def size_to_bytes(hr_size: str) -> int:
"""
Converts a human-readable size from postgresql.conf (e.g. 512kB, 10MB)
into a size in bytes.
"""
units = {"B": 1, "kB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4, "PB": 1024**5}
match = re.search(r"^\'?(\d+)\s*([kMGTP]?B)?\'?$", hr_size)
assert match is not None, f'"{hr_size}" is not a well-formatted human-readable size'
number, unit = match.groups()
if unit:
amp = units[unit]
else:
amp = 8192
return int(number) * amp
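A few illustrative conversions, derived from the regex and units table above; note that a bare number is multiplied by 8192, mirroring Postgres's convention of counting block-sized settings in 8 kB pages:

```python
# Illustrative expectations for size_to_bytes(), based solely on the code above.
assert size_to_bytes("512kB") == 512 * 1024
assert size_to_bytes("10MB") == 10 * 1024**2
assert size_to_bytes("'1MB'") == 1024**2   # quoted values from postgresql.conf are accepted
assert size_to_bytes("16") == 16 * 8192    # no unit: treated as 8 kB blocks
```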
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,

View File

@@ -193,7 +193,7 @@ class Workload:
def validate(self, pageserver_id: int | None = None):
endpoint = self.endpoint(pageserver_id)
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
result = endpoint.safe_psql(f"SELECT COUNT(*) FROM {self.table}")
log.info(f"validate({self.expect_rows}): {result}")

View File

@@ -15,6 +15,7 @@ Some handy pytest flags for local development:
- `-k` selects a test to run
- `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
- `--preserve-database-files` to skip cleanup
- `--out-dir` to produce a JSON with the recorded test metrics
# What performance tests do we have and how we run them
@@ -36,6 +37,6 @@ All tests run only once. Usually to obtain more consistent performance numbers,
## Results collection
Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks.
Local test results for the main branch, and results of daily performance tests, are stored in a [neon project](https://console.neon.tech/app/projects/withered-sky-69117821) deployed in the production environment. There is a Grafana dashboard that visualizes the results; here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). Its main shortcoming is that it cannot point at a particular commit, even though the data for that is available in the database. It needs some tweaking from someone who knows Grafana tricks.
There is also an inconsistency in test naming. Test names should be the same across platforms, with results differentiated by the platform field, but currently the platform is sometimes baked into the test name because of how parametrization works in pytest. For example, the dashboard has a platform switch with neon-local-ci and neon-staging variants, yet some tests under the neon-local-ci platform are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]`, which is highly confusing.

View File

@@ -0,0 +1,307 @@
import dataclasses
import json
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.utils import humantime_to_ms
TARGET_RUNTIME = 60
@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
@pytest.mark.parametrize(
"tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
[
# the following cases demonstrate how not-batchable workloads suffer from the batching timeout
(50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"),
(50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"),
(50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"),
# the next 4 cases demonstrate how batchable workloads benefit from batching
(50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"),
(50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"),
(50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"),
(50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"),
],
)
def test_getpage_merge_smoke(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
tablesize_mib: int,
batch_timeout: str | None,
target_runtime: int,
effective_io_concurrency: int,
readhead_buffer_size: int,
name: str,
):
"""
Do a bunch of sequential scans and ensure that the pageserver does some merging.
"""
#
# record perf-related parameters as metrics to simplify processing of results
#
params: dict[str, tuple[float | int, dict[str, Any]]] = {}
params.update(
{
"tablesize_mib": (tablesize_mib, {"unit": "MiB"}),
"batch_timeout": (
-1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout),
{"unit": "us"},
),
# target_runtime is just a polite ask to the workload to run for this long
"effective_io_concurrency": (effective_io_concurrency, {}),
"readhead_buffer_size": (readhead_buffer_size, {}),
# name is not a metric
}
)
log.info("params: %s", params)
for param, (value, kwargs) in params.items():
zenbenchmark.record(
param,
metric_value=value,
unit=kwargs.pop("unit", ""),
report=MetricReport.TEST_PARAM,
**kwargs,
)
#
# Setup
#
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends
cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}")
cur.execute(
f"SET neon.readahead_buffer_size={readhead_buffer_size}"
) # this is the current default value, but let's hard-code that
cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
log.info("Filling the table")
cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
tablesize = tablesize_mib * 1024 * 1024
npages = tablesize // (8 * 1024)
cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
# TODO: can we force postgres to do sequential scans?
#
# Run the workload, collect `Metrics` before and after, calculate difference, normalize.
#
@dataclass
class Metrics:
time: float
pageserver_getpage_count: float
pageserver_vectored_get_count: float
compute_getpage_count: float
pageserver_cpu_seconds_total: float
def __sub__(self, other: "Metrics") -> "Metrics":
return Metrics(
time=self.time - other.time,
pageserver_getpage_count=self.pageserver_getpage_count
- other.pageserver_getpage_count,
pageserver_vectored_get_count=self.pageserver_vectored_get_count
- other.pageserver_vectored_get_count,
compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
- other.pageserver_cpu_seconds_total,
)
def normalize(self, by) -> "Metrics":
return Metrics(
time=self.time / by,
pageserver_getpage_count=self.pageserver_getpage_count / by,
pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
compute_getpage_count=self.compute_getpage_count / by,
pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
)
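The comment above ("collect `Metrics` before and after, calculate difference, normalize") is the core of the measurement; a toy illustration with invented numbers:

```python
# Invented numbers, purely to illustrate the delta-and-normalize pattern.
snap_before = Metrics(
    time=0.0,
    pageserver_getpage_count=100.0,
    pageserver_vectored_get_count=10.0,
    compute_getpage_count=100.0,
    pageserver_cpu_seconds_total=1.0,
)
snap_after = Metrics(
    time=30.0,
    pageserver_getpage_count=400.0,
    pageserver_vectored_get_count=40.0,
    compute_getpage_count=400.0,
    pageserver_cpu_seconds_total=4.0,
)
per_iteration = (snap_after - snap_before).normalize(3)  # three measured iterations
# per_iteration.pageserver_getpage_count == 100.0, per_iteration.time == 10.0
```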
def get_metrics() -> Metrics:
with conn.cursor() as cur:
cur.execute(
"select value from neon_perf_counters where metric='getpage_wait_seconds_count';"
)
compute_getpage_count = cur.fetchall()[0][0]
pageserver_metrics = ps_http.get_metrics()
return Metrics(
time=time.time(),
pageserver_getpage_count=pageserver_metrics.query_one(
"pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
).value,
pageserver_vectored_get_count=pageserver_metrics.query_one(
"pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
).value,
compute_getpage_count=compute_getpage_count,
pageserver_cpu_seconds_total=pageserver_metrics.query_one(
"libmetrics_process_cpu_seconds_highres"
).value,
)
def workload() -> Metrics:
start = time.time()
iters = 0
while time.time() - start < target_runtime or iters < 2:
log.info("Seqscan %d", iters)
if iters == 1:
# round zero for warming up
before = get_metrics()
cur.execute(
"select clear_buffer_cache()"
) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
cur.execute("select sum(data::bigint) from t")
assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
iters += 1
after = get_metrics()
return (after - before).normalize(iters - 1)
env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout})
env.pageserver.restart()
metrics = workload()
log.info("Results: %s", metrics)
#
# Sanity-checks on the collected data
#
# assert that getpage counts roughly match between compute and ps
assert metrics.pageserver_getpage_count == pytest.approx(
metrics.compute_getpage_count, rel=0.01
)
#
# Record the results
#
for metric, value in dataclasses.asdict(metrics).items():
zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
zenbenchmark.record(
"perfmetric.batching_factor",
metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
unit="",
report=MetricReport.HIGHER_IS_BETTER,
)
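The batching factor recorded above is simply getpage requests divided by vectored reads; a quick worked example with invented numbers:

```python
# Invented numbers: 40_000 getpage requests served by 5_000 vectored reads
# means roughly 8 getpage requests were merged into each vectored read.
batching_factor = 40_000 / 5_000  # == 8.0
```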
@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
@pytest.mark.parametrize(
"batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"]
)
def test_timer_precision(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
pg_bin: PgBin,
batch_timeout: str | None,
):
"""
Determine the batching timeout precision (mean latency) and tail latency impact.
The baseline is `None`; an ideal batching timeout implementation would increase
the mean latency by exactly `batch_timeout`.
That is not the case with the current implementation; this will be addressed in future changes.
"""
#
# Setup
#
def patch_ps_config(ps_config):
ps_config["server_side_batch_timeout"] = batch_timeout
neon_env_builder.pageserver_config_override = patch_ps_config
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends
cur.execute("SET effective_io_concurrency=1")
cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
log.info("Filling the table")
cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
tablesize = 50 * 1024 * 1024
npages = tablesize // (8 * 1024)
cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
# TODO: can we force postgres to do sequential scans?
cur.close()
conn.close()
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
endpoint.stop()
for sk in env.safekeepers:
sk.stop()
#
# Run single-threaded pagebench (TODO: dedup with other benchmark code)
#
env.pageserver.allowed_errors.append(
# https://github.com/neondatabase/neon/issues/6925
r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
)
ps_http = env.pageserver.http_client()
cmd = [
str(env.neon_binpath / "pagebench"),
"get-page-latest-lsn",
"--mgmt-api-endpoint",
ps_http.base_url,
"--page-service-connstring",
env.pageserver.connstr(password=None),
"--num-clients",
"1",
"--runtime",
"10s",
]
log.info(f"command: {' '.join(cmd)}")
basepath = pg_bin.run_capture(cmd, with_command_header=False)
results_path = Path(basepath + ".stdout")
log.info(f"Benchmark results at: {results_path}")
with open(results_path) as f:
results = json.load(f)
log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
total = results["total"]
metric = "latency_mean"
zenbenchmark.record(
metric,
metric_value=humantime_to_ms(total[metric]),
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)
metric = "latency_percentiles"
for k, v in total[metric].items():
zenbenchmark.record(
f"{metric}.{k}",
metric_value=humantime_to_ms(v),
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)

View File

@@ -103,6 +103,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
cur.execute(f"update tbl{i} set j = {j};")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http.timeline_checkpoint(
tenant_id, timeline_id, compact=False
) # ^1: flush all in-memory layers
endpoint.stop()
# Check we have generated the L0 stack we expected
@@ -118,7 +121,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
return v * 1024
before = rss_hwm()
pageserver_http.timeline_compact(tenant_id, timeline_id)
pageserver_http.timeline_compact(
tenant_id, timeline_id
) # ^1: we must ensure that no new L0 layers are flushed during this process
after = rss_hwm()
log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})")
@@ -137,7 +142,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare):
# To be fixed in https://github.com/neondatabase/neon/issues/8184, after which
# this memory estimate can be revised far downwards to something that doesn't scale
# linearly with the layer sizes.
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5
MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25
# If we find that compaction is using more memory, this may indicate a regression
assert compaction_mapped_rss < MEMORY_ESTIMATE

View File

@@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path):
"--no-acl",
"--skip-db-properties",
"--table-jobs",
"4",
"8",
"--index-jobs",
"4",
"8",
"--restore-jobs",
"4",
"8",
"--split-tables-larger-than",
"10GB",
"5GB",
"--skip-extensions",
"--use-copy-binary",
"--filters",
@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
"LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
"PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
"PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16",
}
# Combine the current environment with custom variables
env = os.environ.copy()

View File

@@ -15,21 +15,61 @@ from fixtures.neon_fixtures import (
@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
@pytest.mark.parametrize(
"wal_receiver_protocol",
[
"vanilla",
"interpreted-bincode-compressed",
"interpreted-protobuf-compressed",
],
)
def test_sharded_ingest(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
shard_count: int,
wal_receiver_protocol: str,
):
"""
Benchmarks sharded ingestion throughput by ingesting a large amount of WAL into a Safekeeper
and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case
(shard_count=1) to the sharded case indicates the overhead of sharding.
"""
ROW_COUNT = 100_000_000 # about 7 GB of WAL
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_start()
env = neon_env_builder.init_configs()
for ps in env.pageservers:
if wal_receiver_protocol == "vanilla":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "vanilla",
}
}
)
elif wal_receiver_protocol == "interpreted-bincode-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
}
}
)
elif wal_receiver_protocol == "interpreted-protobuf-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
}
}
)
else:
raise AssertionError("Test must use explicit wal receiver protocol config")
env.start()
# Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure
# the storage controller doesn't mess with shard placements.
@@ -50,7 +90,6 @@ def test_sharded_ingest(
# Start the endpoint.
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
# Ingest data and measure WAL volume and duration.
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
@@ -68,4 +107,48 @@ def test_sharded_ingest(
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
total_ingested = 0
total_records_received = 0
ingested_by_ps = []
for pageserver in env.pageservers:
ingested = pageserver.http_client().get_metric_value(
"pageserver_wal_ingest_bytes_received_total"
)
records_received = pageserver.http_client().get_metric_value(
"pageserver_wal_ingest_records_received_total"
)
if ingested is None:
ingested = 0
if records_received is None:
records_received = 0
ingested_by_ps.append(
(
pageserver.id,
{
"ingested": ingested,
"records_received": records_received,
},
)
)
total_ingested += int(ingested)
total_records_received += int(records_received)
total_ingested_mb = total_ingested / (1024 * 1024)
zenbenchmark.record("wal_ingested", total_ingested_mb, "MB", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record(
"records_received", total_records_received, "records", MetricReport.LOWER_IS_BETTER
)
ingested_by_ps.sort(key=lambda x: x[0])
for _, stats in ingested_by_ps:
for k in stats:
if k != "records_received":
stats[k] /= 1024**2
log.info(f"WAL ingested by each pageserver {ingested_by_ps}")
assert tenant_get_shards(env, tenant_id) == shards, "shards moved"

View File

@@ -174,6 +174,10 @@ def test_fully_custom_config(positive_env: NeonEnv):
"lsn_lease_length": "1m",
"lsn_lease_length_for_ts": "5s",
"timeline_offloading": True,
"wal_receiver_protocol_override": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
}
vps_http = env.storage_controller.pageserver_api()

View File

@@ -5,12 +5,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
],
)
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
@@ -36,7 +31,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.
@@ -65,12 +60,7 @@ def test_combocid_lock(neon_env_builder: NeonEnvBuilder):
def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
],
)
endpoint = env.endpoints.create_start("main")
conn = endpoint.connect()
cur = conn.cursor()
@@ -98,7 +88,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
cur.execute("delete from t")
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.

View File

@@ -8,6 +8,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverWalReceiverProtocol,
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
@@ -27,7 +28,13 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
a small number of rows and manually instructs the pageserver to run compaction
@@ -36,6 +43,8 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
observed bounds.
"""
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
# Effectively disable the page cache to rely only on image layers
# to shorten reads.
neon_env_builder.pageserver_config_override = """

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
import pytest
from fixtures.log_helper import log
from fixtures.neon_cli import WalCraft
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.
@@ -19,7 +19,17 @@ from fixtures.neon_fixtures import NeonEnvBuilder
"wal_record_crossing_segment_followed_by_small_one",
],
)
def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_crafted_wal_end(
neon_env_builder: NeonEnvBuilder,
wal_type: str,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
env.create_branch("test_crafted_wal_end")
env.pageserver.allowed_errors.extend(

View File

@@ -2,10 +2,13 @@ from __future__ import annotations
from pathlib import Path
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import USE_LFC
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
env = neon_simple_env
@@ -16,8 +19,6 @@ def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],

View File

@@ -170,7 +170,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# re-execute the query, it will make GetPage
# requests. This does not clear the last-written LSN cache
# so we still remember the LSNs of the pages.
secondary.clear_shared_buffers(cursor=s_cur)
secondary.clear_buffers(cursor=s_cur)
if pause_apply:
s_cur.execute("SELECT pg_wal_replay_pause()")

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import os
import random
import re
import subprocess
@@ -10,20 +9,24 @@ import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.utils import USE_LFC
@pytest.mark.timeout(600)
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
"""
Test resizing the Local File Cache
"""
env = neon_simple_env
cache_dir = env.repo_dir / "file_cache"
cache_dir.mkdir(exist_ok=True)
env.create_branch("test_lfc_resize")
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=512MB",
"neon.file_cache_size_limit=512MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
@@ -63,8 +66,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
cur.execute("select pg_reload_conf()")
nretries = 10
while True:
lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
lfc_file_size = os.path.getsize(lfc_file_path)
lfc_file_path = endpoint.lfc_path()
lfc_file_size = lfc_file_path.stat().st_size
res = subprocess.run(
["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
)

View File

@@ -3,11 +3,13 @@ from __future__ import annotations
import time
from pathlib import Path
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
from fixtures.utils import USE_LFC, query_scalar
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
@@ -18,8 +20,6 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
@@ -72,9 +72,10 @@ WITH (fillfactor='100');
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10
assert blocks < 12
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env

View File

@@ -6,10 +6,12 @@ import random
import threading
import time
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import query_scalar
from fixtures.utils import USE_LFC, query_scalar
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
@@ -19,8 +21,6 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='64MB'",
"neon.file_cache_size_limit='10MB'",
],

View File

@@ -12,7 +12,7 @@ from fixtures.neon_fixtures import (
logical_replication_sync,
wait_for_last_flush_lsn,
)
from fixtures.utils import wait_until
from fixtures.utils import USE_LFC, wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import (
@@ -576,7 +576,15 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van
# We want all data to fit into shared_buffers because later we stop
# safekeeper and insert more; this shouldn't cause page requests as they
# will be stuck.
sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"])
sub = env.endpoints.create(
"subscriber",
config_lines=[
"neon.max_file_cache_size = 32MB",
"neon.file_cache_size_limit = 32MB",
]
if USE_LFC
else [],
)
sub.start()
with vanilla_pg.cursor() as pcur:

View File

@@ -39,7 +39,7 @@ def test_oid_overflow(neon_env_builder: NeonEnvBuilder):
oid = cur.fetchall()[0][0]
log.info(f"t2.relfilenode={oid}")
endpoint.clear_shared_buffers(cursor=cur)
endpoint.clear_buffers(cursor=cur)
cur.execute("SELECT x from t1")
assert cur.fetchone() == (1,)

View File

@@ -54,7 +54,7 @@ def test_read_validation(neon_simple_env: NeonEnv):
log.info("Clear buffer cache to ensure no stale pages are brought into the cache")
endpoint.clear_shared_buffers(cursor=c)
endpoint.clear_buffers(cursor=c)
cache_entries = query_scalar(
c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}"

View File

@@ -230,7 +230,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
return offset
# Insert some records on main branch
with env.endpoints.create_start("main") as ep_main:
with env.endpoints.create_start("main", config_lines=["shared_buffers=1MB"]) as ep_main:
with ep_main.cursor() as cur:
cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)")
lsn = Lsn(0)

View File

@@ -1,6 +1,11 @@
from __future__ import annotations
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
import pytest
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverWalReceiverProtocol,
check_restored_datadir_content,
)
# Test subtransactions
@@ -9,8 +14,14 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
# maintained in the pageserver, so subtransactions are not very exciting for
# Neon. They are included in the commit record though and updated in the
# CLOG.
def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
env = neon_simple_env
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol):
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
pg_conn = endpoint.connect()

View File

@@ -416,7 +416,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn
assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None
ep.clear_shared_buffers()
ep.clear_buffers()
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0
ep.stop()

View File

@@ -63,7 +63,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
# Clear the buffer cache, to force the VM page to be re-fetched from
# the page server
endpoint.clear_shared_buffers(cursor=cur)
endpoint.clear_buffers(cursor=cur)
# Check that an index-only scan doesn't see the deleted row. If the
# clearing of the VM bit was not replayed correctly, this would incorrectly

View File

@@ -2446,7 +2446,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
# generate some data to commit WAL on safekeepers
endpoint.safe_psql("insert into t select generate_series(1,100), 'action'")
# clear the buffers
endpoint.clear_shared_buffers()
endpoint.clear_buffers()
# read data to fetch pages from pageserver
endpoint.safe_psql("select sum(i) from t")

View File

@@ -11,7 +11,13 @@ import pytest
import toml
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import getLogger
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
PageserverWalReceiverProtocol,
Safekeeper,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import skip_in_debug_build
@@ -622,8 +628,15 @@ async def run_segment_init_failure(env: NeonEnv):
# Test (injected) failure during WAL segment init.
# https://github.com/neondatabase/neon/issues/6401
# https://github.com/neondatabase/neon/issues/6402
def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_segment_init_failure(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
):
neon_env_builder.num_safekeepers = 1
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
asyncio.run(run_segment_init_failure(env))

View File

@@ -1,7 +1,7 @@
{
"v17": [
"17.2",
"cb62fe5f5ddca22f7c91843f4d49b42730605f6c"
"3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f"
],
"v16": [
"16.6",