From 4630b709627ffea01dd6863f0db84ec4c693bcd6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 25 Nov 2024 09:25:18 -0500 Subject: [PATCH 01/29] fix(pageserver): ensure all layers are flushed before measuring RSS (#9861) ## Problem close https://github.com/neondatabase/neon/issues/9761 The test assumed that no new L0 layers are flushed throughout the process, which is not true. ## Summary of changes Fix the test case `test_compaction_l0_memory` by flushing in-memory layers before compaction. Signed-off-by: Alex Chi Z --- test_runner/performance/test_compaction.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 8868dddf39..0cd1080fa7 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -103,6 +103,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): cur.execute(f"update tbl{i} set j = {j};") wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint( + tenant_id, timeline_id, compact=False + ) # ^1: flush all in-memory layers endpoint.stop() # Check we have generated the L0 stack we expected @@ -118,7 +121,9 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): return v * 1024 before = rss_hwm() - pageserver_http.timeline_compact(tenant_id, timeline_id) + pageserver_http.timeline_compact( + tenant_id, timeline_id + ) # ^1: we must ensure during this process no new L0 layers are flushed after = rss_hwm() log.info(f"RSS across compaction: {before} -> {after} (grew {after - before})") @@ -137,7 +142,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): # To be fixed in https://github.com/neondatabase/neon/issues/8184, after which # this memory estimate can be revised far downwards to something that doesn't scale # linearly with the layer sizes. 
- MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.5 + MEMORY_ESTIMATE = (initial_l0s_size - final_l0s_size) * 1.25 # If we find that compaction is using more memory, this may indicate a regression assert compaction_mapped_rss < MEMORY_ESTIMATE From 3d380acbd1dd878b908df860a541ce77bd4506f3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 25 Nov 2024 14:43:32 +0000 Subject: [PATCH 02/29] Bump default Debian version to Bookworm everywhere (#9863) ## Problem We have a couple of CI workflows that still run on Debian Bullseye, and the default Debian version in images is Bullseye as well (we explicitly set building on Bookworm) ## Summary of changes - Run `pgbench-pgvector` on Bookworm (fix a couple of packages) - Run `trigger_bench_on_ec2_machine_in_eu_central_1` on Bookworm - Change default `DEBIAN_VERSION` in Dockerfiles to Bookworm - Make `pinned` docker tag an alias to `pinned-bookworm` --- .github/workflows/benchmarking.yml | 14 +++++++------- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/periodic_pagebench.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 2 +- Dockerfile | 2 +- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2ad1ee0a42..ea8fee80c2 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -541,7 +541,7 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -558,12 +558,12 @@ jobs: arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g') cd /home/nonroot - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb" - wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg110+1_${arch}.deb" - dpkg -x libpq5_17.2-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-16_16.6-1.pgdg110+1_${arch}.deb pg - dpkg -x postgresql-client-16_16.6-1.pgdg110+1_${arch}.deb pg + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb" + wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb" + dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg + dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg + dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg mkdir -p /tmp/neon/pg_install/v16/bin ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 9e7be76901..93da86a353 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -117,7 +117,7 @@ jobs: - name: Create multi-arch image env: - DEFAULT_DEBIAN_VERSION: bullseye + DEFAULT_DEBIAN_VERSION: bookworm IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian_version in bullseye bookworm; do diff --git 
a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 1cce348ae2..6b98bc873f 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -29,7 +29,7 @@ jobs: trigger_bench_on_ec2_machine_in_eu_central_1: runs-on: [ self-hosted, small ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index c196d07d3e..5b43d97de6 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -94,7 +94,7 @@ jobs: - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR env: - DEFAULT_DEBIAN_VERSION: bullseye + DEFAULT_DEBIAN_VERSION: bookworm run: | for debian_version in bullseye bookworm; do tags=() diff --git a/Dockerfile b/Dockerfile index 785dd4598e..e888efbae2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 24e5bbf46f..4f491afec5 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,4 +1,4 @@ -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm FROM debian:bookworm-slim AS pgcopydb_builder ARG DEBIAN_VERSION diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7c21c67a0a..2fcd9985bc 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -3,7 +3,7 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_VERSION=bookworm ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### From 77630e5408c0771b0e1db020f8d81a7be9728391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 25 Nov 2024 15:59:12 +0100 Subject: [PATCH 03/29] Address beta clippy lint needless_lifetimes (#9877) The 1.82.0 version of Rust will be stable soon, let's get the clippy lint fixes in before the compiler version upgrade. 
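For illustration, a minimal sketch of the pattern this lint targets (hypothetical `Wrapper` type, not code from this PR): `clippy::needless_lifetimes` flags lifetime parameters that are declared on an `impl` but only used in the self type, and suggests eliding them with `'_`, which is exactly what the hunks below do.

```rust
use std::fmt::{self, Debug, Formatter};

struct Wrapper<'a>(&'a [u8]);

// Before: impl<'a> Debug for Wrapper<'a> { ... }
// After: the lifetime is elided, as the lint suggests.
impl Debug for Wrapper<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.debug_list().entries(self.0.iter()).finish()
    }
}

fn main() {
    println!("{:?}", Wrapper(&[1u8, 2, 3]));
}
```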
--- libs/vm_monitor/src/cgroup.rs | 4 ++-- .../virtual_file/owned_buffers_io/aligned_buffer/slice.rs | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 3223765016..1d70cedcf9 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -218,7 +218,7 @@ impl MemoryStatus { fn debug_slice(slice: &[Self]) -> impl '_ + Debug { struct DS<'a>(&'a [MemoryStatus]); - impl<'a> Debug for DS<'a> { + impl Debug for DS<'_> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_struct("[MemoryStatus]") .field( @@ -233,7 +233,7 @@ impl MemoryStatus { struct Fields<'a, F>(&'a [MemoryStatus], F); - impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> { + impl<F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'_, F> { fn fmt(&self, f: &mut Formatter) -> fmt::Result { f.debug_list().entries(self.0.iter().map(&self.1)).finish() } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs index 6cecf34c1c..1952b82578 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs @@ -19,7 +19,7 @@ impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> { } } -impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { +impl<const N: usize, A: Alignment> Deref for AlignedSlice<'_, N, A> { type Target = [u8; N]; fn deref(&self) -> &Self::Target { @@ -27,13 +27,13 @@ impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { fn deref_mut(&mut self) -> &mut Self::Target { self.buf } } -impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { +impl<const N: usize, A: Alignment> DerefMut for AlignedSlice<'_, N, A> { fn deref_mut(&mut self) -> &mut Self::Target { self.buf } } -impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { +impl<const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'_, N, A> { fn as_ref(&self) -> &[u8; N] { self.buf } From 441612c1ce11f92d0fad1226be0e42f5500c2c46 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 25 Nov 2024 17:21:52 +0200 Subject: [PATCH 04/29] Prefetch on macOS (#9875) ## Problem Prefetch is disabled on macOS because `posix_fadvise` is not available. However, Neon's prefetch does not use this function, and having prefetch available is very convenient for testing on macOS. ## Summary of changes Define `USE_PREFETCH` in the Makefile. --------- Co-authored-by: Konstantin Knizhnik --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 8e3b755112..dc67b87239 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,7 @@ ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) + PG_CFLAGS += -DUSE_PREFETCH ifndef DISABLE_HOMEBREW # macOS with brew-installed openssl requires explicit paths # It can be configured with OPENSSL_PREFIX variable From 5c2356988e8beea0418e1292da2adf4b988340ad Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 25 Nov 2024 16:52:39 +0100 Subject: [PATCH 05/29] page_service: add benchmark for batching (#9820) This PR adds two benchmarks to demonstrate the effect of server-side getpage request batching added in https://github.com/neondatabase/neon/pull/9321. For CPU usage, I found that the `prometheus` crate's built-in CPU usage metric accounts for CPU seconds at integer granularity. That's not precise enough when you reduce the target benchmark runtime for local iteration.
So, add a new `libmetrics` metric and report that. The benchmarks are disabled because [on our benchmark nodes, timer resolution isn't high enough](https://neondb.slack.com/archives/C059ZC138NR/p1732264223207449). They work (no statement about quality) on my bare-metal devbox. They will be refined and enabled once we find a fix. Candidates at time of writing are: - https://github.com/neondatabase/neon/pull/9822 - https://github.com/neondatabase/neon/pull/9851 Refs: - Epic: https://github.com/neondatabase/neon/issues/9376 - Extracted from https://github.com/neondatabase/neon/pull/9792 --- libs/metrics/src/more_process_metrics.rs | 40 ++- test_runner/performance/README.md | 3 +- .../test_pageserver_getpage_merge.py | 307 ++++++++++++++++++ 3 files changed, 347 insertions(+), 3 deletions(-) create mode 100644 test_runner/performance/pageserver/test_pageserver_getpage_merge.py diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs index 920724fdec..13a745e031 100644 --- a/libs/metrics/src/more_process_metrics.rs +++ b/libs/metrics/src/more_process_metrics.rs @@ -2,14 +2,28 @@ // This module has heavy inspiration from the prometheus crate's `process_collector.rs`. +use once_cell::sync::Lazy; +use prometheus::Gauge; + use crate::UIntGauge; pub struct Collector { descs: Vec, vmlck: crate::UIntGauge, + cpu_seconds_highres: Gauge, } -const NMETRICS: usize = 1; +const NMETRICS: usize = 2; + +static CLK_TCK_F64: Lazy = Lazy::new(|| { + let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; + if long == -1 { + panic!("sysconf(_SC_CLK_TCK) failed"); + } + let convertible_to_f64: i32 = + i32::try_from(long).expect("sysconf(_SC_CLK_TCK) is larger than i32"); + convertible_to_f64 as f64 +}); impl prometheus::core::Collector for Collector { fn desc(&self) -> Vec<&prometheus::core::Desc> { @@ -27,6 +41,12 @@ impl prometheus::core::Collector for Collector { mfs.extend(self.vmlck.collect()) } } + if let Ok(stat) = myself.stat() { + let cpu_seconds = stat.utime + stat.stime; + self.cpu_seconds_highres + .set(cpu_seconds as f64 / *CLK_TCK_F64); + mfs.extend(self.cpu_seconds_highres.collect()); + } mfs } } @@ -43,7 +63,23 @@ impl Collector { .cloned(), ); - Self { descs, vmlck } + let cpu_seconds_highres = Gauge::new( + "libmetrics_process_cpu_seconds_highres", + "Total user and system CPU time spent in seconds.\ + Sub-second resolution, hence better than `process_cpu_seconds_total`.", + ) + .unwrap(); + descs.extend( + prometheus::core::Collector::desc(&cpu_seconds_highres) + .into_iter() + .cloned(), + ); + + Self { + descs, + vmlck, + cpu_seconds_highres, + } } } diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 70d75a6dcf..85096d3770 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,6 +15,7 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup +- `--out-dir` to produce a JSON with the recorded test metrics # What performance tests do we have and how we run them @@ -36,6 +37,6 @@ All tests run only once. Usually to obtain more consistent performance numbers, ## Results collection -Local test results for main branch, and results of daily performance tests, are stored in a neon project deployed in production environment. There is a Grafana dashboard that visualizes the results. 
Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. +Local test results for main branch, and results of daily performance tests, are stored in a [neon project](https://console.neon.tech/app/projects/withered-sky-69117821) deployed in production environment. There is a Grafana dashboard that visualizes the results. Here is the [dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1). The main problem with it is the unavailability to point at particular commit, though the data for that is available in the database. Needs some tweaking from someone who knows Grafana tricks. There is also an inconsistency in test naming. Test name should be the same across platforms, and results can be differentiated by the platform field. But currently, platform is sometimes included in test name because of the way how parametrization works in pytest. I.e. there is a platform switch in the dashboard with neon-local-ci and neon-staging variants. I.e. some tests under neon-local-ci value for a platform switch are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]` which is highly confusing. diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py new file mode 100644 index 0000000000..34cce9900b --- /dev/null +++ b/test_runner/performance/pageserver/test_pageserver_getpage_merge.py @@ -0,0 +1,307 @@ +import dataclasses +import json +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import humantime_to_ms + +TARGET_RUNTIME = 60 + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +@pytest.mark.parametrize( + "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + [ + # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout + (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"), + (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"), + (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"), + # the next 4 cases demonstrate how batchable workloads benefit from batching + (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"), + (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"), + (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"), + (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"), + ], +) +def test_getpage_merge_smoke( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + tablesize_mib: int, + batch_timeout: str | None, + target_runtime: int, + effective_io_concurrency: int, + readhead_buffer_size: int, + name: str, +): + """ + Do a bunch of sequential scans and ensure that the pageserver does some merging. 
+ """ + + # + # record perf-related parameters as metrics to simplify processing of results + # + params: dict[str, tuple[float | int, dict[str, Any]]] = {} + + params.update( + { + "tablesize_mib": (tablesize_mib, {"unit": "MiB"}), + "batch_timeout": ( + -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout), + {"unit": "us"}, + ), + # target_runtime is just a polite ask to the workload to run for this long + "effective_io_concurrency": (effective_io_concurrency, {}), + "readhead_buffer_size": (readhead_buffer_size, {}), + # name is not a metric + } + ) + + log.info("params: %s", params) + + for param, (value, kwargs) in params.items(): + zenbenchmark.record( + param, + metric_value=value, + unit=kwargs.pop("unit", ""), + report=MetricReport.TEST_PARAM, + **kwargs, + ) + + # + # Setup + # + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + endpoint = env.endpoints.create_start("main") + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends + cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}") + cur.execute( + f"SET neon.readahead_buffer_size={readhead_buffer_size}" + ) # this is the current default value, but let's hard-code that + + cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") + cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + + log.info("Filling the table") + cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") + tablesize = tablesize_mib * 1024 * 1024 + npages = tablesize // (8 * 1024) + cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) + # TODO: can we force postgres to do sequential scans? + + # + # Run the workload, collect `Metrics` before and after, calculate difference, normalize. 
+ # + + @dataclass + class Metrics: + time: float + pageserver_getpage_count: float + pageserver_vectored_get_count: float + compute_getpage_count: float + pageserver_cpu_seconds_total: float + + def __sub__(self, other: "Metrics") -> "Metrics": + return Metrics( + time=self.time - other.time, + pageserver_getpage_count=self.pageserver_getpage_count + - other.pageserver_getpage_count, + pageserver_vectored_get_count=self.pageserver_vectored_get_count + - other.pageserver_vectored_get_count, + compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, + pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total + - other.pageserver_cpu_seconds_total, + ) + + def normalize(self, by) -> "Metrics": + return Metrics( + time=self.time / by, + pageserver_getpage_count=self.pageserver_getpage_count / by, + pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + compute_getpage_count=self.compute_getpage_count / by, + pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, + ) + + def get_metrics() -> Metrics: + with conn.cursor() as cur: + cur.execute( + "select value from neon_perf_counters where metric='getpage_wait_seconds_count';" + ) + compute_getpage_count = cur.fetchall()[0][0] + pageserver_metrics = ps_http.get_metrics() + return Metrics( + time=time.time(), + pageserver_getpage_count=pageserver_metrics.query_one( + "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + ).value, + pageserver_vectored_get_count=pageserver_metrics.query_one( + "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + ).value, + compute_getpage_count=compute_getpage_count, + pageserver_cpu_seconds_total=pageserver_metrics.query_one( + "libmetrics_process_cpu_seconds_highres" + ).value, + ) + + def workload() -> Metrics: + start = time.time() + iters = 0 + while time.time() - start < target_runtime or iters < 2: + log.info("Seqscan %d", iters) + if iters == 1: + # round zero for warming up + before = get_metrics() + cur.execute( + "select clear_buffer_cache()" + ) # TODO: what about LFC? 
doesn't matter right now because LFC isn't enabled by default in tests + cur.execute("select sum(data::bigint) from t") + assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 + iters += 1 + after = get_metrics() + return (after - before).normalize(iters - 1) + + env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout}) + env.pageserver.restart() + metrics = workload() + + log.info("Results: %s", metrics) + + # + # Sanity-checks on the collected data + # + # assert that getpage counts roughly match between compute and ps + assert metrics.pageserver_getpage_count == pytest.approx( + metrics.compute_getpage_count, rel=0.01 + ) + + # + # Record the results + # + + for metric, value in dataclasses.asdict(metrics).items(): + zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM) + + zenbenchmark.record( + "perfmetric.batching_factor", + metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + +@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +@pytest.mark.parametrize( + "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"] +) +def test_timer_precision( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + batch_timeout: str | None, +): + """ + Determine the batching timeout precision (mean latency) and tail latency impact. + + The baseline is `None`; an ideal batching timeout implementation would increase + the mean latency by exactly `batch_timeout`. + + That is not the case with the current implementation, will be addressed in future changes. + """ + + # + # Setup + # + + def patch_ps_config(ps_config): + ps_config["server_side_batch_timeout"] = batch_timeout + + neon_env_builder.pageserver_config_override = patch_ps_config + + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + conn = endpoint.connect() + cur = conn.cursor() + + cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends + cur.execute("SET effective_io_concurrency=1") + + cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") + cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + + log.info("Filling the table") + cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") + tablesize = 50 * 1024 * 1024 + npages = tablesize // (8 * 1024) + cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) + # TODO: can we force postgres to do sequential scans? 
+ + cur.close() + conn.close() + + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + endpoint.stop() + + for sk in env.safekeepers: + sk.stop() + + # + # Run single-threaded pagebench (TODO: dedup with other benchmark code) + # + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + ps_http = env.pageserver.http_client() + + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--num-clients", + "1", + "--runtime", + "10s", + ] + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path) as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + + metric = "latency_mean" + zenbenchmark.record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + zenbenchmark.record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) From 7a2f0ed8d452cd05bb1b9a85f1cda6e00ead1f85 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 25 Nov 2024 17:29:28 +0000 Subject: [PATCH 06/29] safekeeper: lift decoding and interpretation of WAL to the safekeeper (#9746) ## Problem For any given tenant shard, pageservers receive all of the tenant's WAL from the safekeeper. This soft-blocks us from using larger shard counts due to bandwidth concerns and CPU overhead of filtering out the records. ## Summary of changes This PR lifts the decoding and interpretation of WAL from the pageserver into the safekeeper. A customised PG replication protocol is used where instead of sending raw WAL, the safekeeper sends filtered, interpreted records. The receiver drives the protocol selection, so, on the pageserver side, usage of the new protocol is gated by a new pageserver config: `wal_receiver_protocol`. More granularly the changes are: 1. Optionally inject the protocol and shard identity into the arguments used for starting replication 2. On the safekeeper side, implement a new wal sending primitive which decodes and interprets records before sending them over 3. On the pageserver side, implement the ingestion of this new replication message type. It's very similar to what we already have for raw wal (minus decoding and interpreting). ## Notes * This PR currently uses my [branch of rust-postgres](https://github.com/neondatabase/rust-postgres/tree/vlad/interpreted-wal-record-replication-support) which includes the deserialization logic for the new replication message type. PR for that is open [here](https://github.com/neondatabase/rust-postgres/pull/32). * This PR contains changes for both pageservers and safekeepers. It's safe to merge because the new protocol is disabled by default on the pageserver side. We can gradually start enabling it in subsequent releases. 
* CI tests are running on https://github.com/neondatabase/neon/pull/9747 ## Links Related: https://github.com/neondatabase/neon/issues/9336 Epic: https://github.com/neondatabase/neon/issues/9329 --- Cargo.lock | 11 +- libs/pageserver_api/src/config.rs | 7 +- libs/pq_proto/src/lib.rs | 36 +++++ libs/utils/Cargo.toml | 1 + libs/utils/src/postgres_client.rs | 95 +++++++++-- libs/wal_decoder/src/models.rs | 12 ++ libs/wal_decoder/src/serialized_batch.rs | 15 +- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/config.rs | 5 + pageserver/src/pgdatadir_mapping.rs | 7 +- pageserver/src/tenant/timeline.rs | 3 +- pageserver/src/tenant/timeline/walreceiver.rs | 2 + .../walreceiver/connection_manager.rs | 40 +++-- .../walreceiver/walreceiver_connection.rs | 149 ++++++++++++++++-- safekeeper/Cargo.toml | 2 + safekeeper/src/handler.rs | 79 ++++++++++ safekeeper/src/lib.rs | 2 + safekeeper/src/recovery.rs | 13 +- safekeeper/src/send_interpreted_wal.rs | 121 ++++++++++++++ safekeeper/src/send_wal.rs | 122 +++++++++++--- safekeeper/src/wal_reader_stream.rs | 149 ++++++++++++++++++ .../performance/test_sharded_ingest.py | 50 +++++- test_runner/regress/test_compaction.py | 7 +- test_runner/regress/test_crafted_wal_end.py | 9 +- test_runner/regress/test_subxacts.py | 12 +- .../regress/test_wal_acceptor_async.py | 6 +- 26 files changed, 870 insertions(+), 86 deletions(-) create mode 100644 safekeeper/src/send_interpreted_wal.rs create mode 100644 safekeeper/src/wal_reader_stream.rs diff --git a/Cargo.lock b/Cargo.lock index 98d2e0864a..c1a14210de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4133,7 +4133,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "bytes", "fallible-iterator", @@ -4146,7 +4146,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "base64 0.20.0", "byteorder", @@ -4165,7 +4165,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "bytes", "fallible-iterator", @@ -5364,6 +5364,7 @@ dependencies = [ "itertools 0.10.5", "metrics", "once_cell", + "pageserver_api", "parking_lot 0.12.1", "postgres", "postgres-protocol", @@ -5395,6 +5396,7 @@ dependencies = [ "tracing-subscriber", "url", "utils", + "wal_decoder", "walproposer", "workspace_hack", ] @@ -6466,7 +6468,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" dependencies = [ "async-trait", "byteorder", @@ -7021,6 +7023,7 @@ dependencies = [ "serde_assert", "serde_json", "serde_path_to_error", + "serde_with", "signal-hook", "strum", "strum_macros", diff --git 
a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7666728427..0abca5cdc2 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -18,7 +18,7 @@ use std::{ str::FromStr, time::Duration, }; -use utils::logging::LogFormat; +use utils::{logging::LogFormat, postgres_client::PostgresClientProtocol}; use crate::models::ImageCompressionAlgorithm; use crate::models::LsnLease; @@ -120,6 +120,7 @@ pub struct ConfigToml { pub no_sync: Option, #[serde(with = "humantime_serde")] pub server_side_batch_timeout: Option, + pub wal_receiver_protocol: PostgresClientProtocol, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -330,6 +331,9 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; + + pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = + utils::postgres_client::PostgresClientProtocol::Vanilla; } impl Default for ConfigToml { @@ -418,6 +422,7 @@ impl Default for ConfigToml { .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, + wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, } } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 6c40968496..b7871ab01f 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -562,6 +562,9 @@ pub enum BeMessage<'a> { options: &'a [&'a str], }, KeepAlive(WalSndKeepAlive), + /// Batch of interpreted, shard filtered WAL records, + /// ready for the pageserver to ingest + InterpretedWalRecords(InterpretedWalRecordsBody<'a>), } /// Common shorthands. @@ -672,6 +675,25 @@ pub struct WalSndKeepAlive { pub request_reply: bool, } +/// Batch of interpreted WAL records used in the interpreted +/// safekeeper to pageserver protocol. +/// +/// Note that the pageserver uses the RawInterpretedWalRecordsBody +/// counterpart of this from the neondatabase/rust-postgres repo. +/// If you're changing this struct, you likely need to change its +/// twin as well. +#[derive(Debug)] +pub struct InterpretedWalRecordsBody<'a> { + /// End of raw WAL in [`Self::data`] + pub streaming_lsn: u64, + /// Current end of WAL on the server + pub commit_lsn: u64, + /// Start LSN of the next record in PG WAL. + /// Is 0 if the portion of PG WAL did not contain any records. + pub next_record_lsn: u64, + pub data: &'a [u8], +} + pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]); // single text column @@ -996,6 +1018,20 @@ impl BeMessage<'_> { Ok(()) })? } + + BeMessage::InterpretedWalRecords(rec) => { + // We use the COPY_DATA_TAG for our custom message + // since this tag is interpreted as raw bytes. 
+ buf.put_u8(b'd'); + write_body(buf, |buf| { + buf.put_u8(b'0'); // matches INTERPRETED_WAL_RECORD_TAG in postgres-protocol + // dependency + buf.put_u64(rec.streaming_lsn); + buf.put_u64(rec.commit_lsn); + buf.put_u64(rec.next_record_lsn); + buf.put_slice(rec.data); + }); + } } Ok(()) } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 4aad0aee2c..f440b81d8f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -33,6 +33,7 @@ pprof.workspace = true regex.workspace = true routerify.workspace = true serde.workspace = true +serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index dba74f5b0b..3073bbde4c 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -7,29 +7,94 @@ use postgres_connection::{parse_host_port, PgConnectionConfig}; use crate::id::TenantTimelineId; +/// Postgres client protocol types +#[derive( + Copy, + Clone, + PartialEq, + Eq, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, +)] +#[strum(serialize_all = "kebab-case")] +#[repr(u8)] +pub enum PostgresClientProtocol { + /// Usual Postgres replication protocol + Vanilla, + /// Custom shard-aware protocol that replicates interpreted records. + /// Used to send wal from safekeeper to pageserver. + Interpreted, +} + +impl TryFrom for PostgresClientProtocol { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (PostgresClientProtocol::Vanilla as u8) => PostgresClientProtocol::Vanilla, + v if v == (PostgresClientProtocol::Interpreted as u8) => { + PostgresClientProtocol::Interpreted + } + x => return Err(x), + }) + } +} + +pub struct ConnectionConfigArgs<'a> { + pub protocol: PostgresClientProtocol, + + pub ttid: TenantTimelineId, + pub shard_number: Option, + pub shard_count: Option, + pub shard_stripe_size: Option, + + pub listen_pg_addr_str: &'a str, + + pub auth_token: Option<&'a str>, + pub availability_zone: Option<&'a str>, +} + +impl<'a> ConnectionConfigArgs<'a> { + fn options(&'a self) -> Vec { + let mut options = vec![ + "-c".to_owned(), + format!("timeline_id={}", self.ttid.timeline_id), + format!("tenant_id={}", self.ttid.tenant_id), + format!("protocol={}", self.protocol as u8), + ]; + + if self.shard_number.is_some() { + assert!(self.shard_count.is_some()); + assert!(self.shard_stripe_size.is_some()); + + options.push(format!("shard_count={}", self.shard_count.unwrap())); + options.push(format!("shard_number={}", self.shard_number.unwrap())); + options.push(format!( + "shard_stripe_size={}", + self.shard_stripe_size.unwrap() + )); + } + + options + } +} + /// Create client config for fetching WAL from safekeeper on particular timeline. /// listen_pg_addr_str is in form host:\[port\]. 
pub fn wal_stream_connection_config( - TenantTimelineId { - tenant_id, - timeline_id, - }: TenantTimelineId, - listen_pg_addr_str: &str, - auth_token: Option<&str>, - availability_zone: Option<&str>, + args: ConnectionConfigArgs, ) -> anyhow::Result { let (host, port) = - parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; + parse_host_port(args.listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); let mut connstr = PgConnectionConfig::new_host_port(host, port) - .extend_options([ - "-c".to_owned(), - format!("timeline_id={}", timeline_id), - format!("tenant_id={}", tenant_id), - ]) - .set_password(auth_token.map(|s| s.to_owned())); + .extend_options(args.options()) + .set_password(args.auth_token.map(|s| s.to_owned())); - if let Some(availability_zone) = availability_zone { + if let Some(availability_zone) = args.availability_zone { connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); } diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index c69f8c869a..7ac425cb5f 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -65,6 +65,18 @@ pub struct InterpretedWalRecord { pub xid: TransactionId, } +impl InterpretedWalRecord { + /// Checks if the WAL record is empty + /// + /// An empty interpreted WAL record has no data or metadata and does not have to be sent to the + /// pageserver. + pub fn is_empty(&self) -> bool { + self.batch.is_empty() + && self.metadata_record.is_none() + && matches!(self.flush_uncommitted, FlushUncommittedRecords::No) + } +} + /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. #[derive(Serialize, Deserialize)] diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 9c0708ebbe..41294da7a0 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -496,11 +496,16 @@ impl SerializedValueBatch { } } - /// Checks if the batch is empty - /// - /// A batch is empty when it contains no serialized values. - /// Note that it may still contain observed values. + /// Checks if the batch contains any serialized or observed values pub fn is_empty(&self) -> bool { + !self.has_data() && self.metadata.is_empty() + } + + /// Checks if the batch contains data + /// + /// Note that if this returns false, it may still contain observed values or + /// a metadata record. + pub fn has_data(&self) -> bool { let empty = self.raw.is_empty(); if cfg!(debug_assertions) && empty { @@ -510,7 +515,7 @@ impl SerializedValueBatch { .all(|meta| matches!(meta, ValueMeta::Observed(_)))); } - empty + !empty } /// Returns the number of values serialized in the batch diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 033a9a4619..a8c2c2e992 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); + info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. 
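For context, a rough sketch of how a caller inside the pageserver crate might fill in the new `ConnectionConfigArgs` when requesting the interpreted protocol. This is illustrative only: `interpreted_connstr_example` is a hypothetical helper, the field types are inferred from the `connection_manager.rs` usage later in this patch, and the snippet assumes the crate's existing imports (`TenantTimelineId`, `ShardIdentity`, `PgConnectionConfig`).

```rust
// Hypothetical helper, not part of the patch: build a safekeeper connection
// config that asks for shard-aware interpreted WAL.
fn interpreted_connstr_example(
    ttid: TenantTimelineId,
    shard: &ShardIdentity,
    safekeeper_addr: &str,
) -> anyhow::Result<PgConnectionConfig> {
    let args = ConnectionConfigArgs {
        protocol: PostgresClientProtocol::Interpreted,
        ttid,
        // Shard parameters are only meaningful for the interpreted protocol;
        // the safekeeper uses them to rebuild a ShardIdentity and filter WAL.
        shard_number: Some(shard.number.0),
        shard_count: Some(shard.count.0),
        shard_stripe_size: Some(shard.stripe_size.0),
        listen_pg_addr_str: safekeeper_addr,
        auth_token: None,
        availability_zone: None,
    };
    // options() adds startup parameters such as timeline_id=..., tenant_id=...,
    // protocol=1, shard_count=..., shard_number=..., shard_stripe_size=...,
    // which handler.rs on the safekeeper side parses back out.
    wal_stream_connection_config(args)
}
```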
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 59ea6fb941..2cf237e72b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -14,6 +14,7 @@ use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use storage_broker::Uri; use utils::logging::SecretString; +use utils::postgres_client::PostgresClientProtocol; use once_cell::sync::OnceCell; use reqwest::Url; @@ -190,6 +191,8 @@ pub struct PageServerConf { /// Maximum amount of time for which a get page request request /// might be held up for request merging. pub server_side_batch_timeout: Option, + + pub wal_receiver_protocol: PostgresClientProtocol, } /// Token for authentication to safekeepers @@ -350,6 +353,7 @@ impl PageServerConf { server_side_batch_timeout, tenant_config, no_sync, + wal_receiver_protocol, } = config_toml; let mut conf = PageServerConf { @@ -393,6 +397,7 @@ impl PageServerConf { import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, + wal_receiver_protocol, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f4f184be5a..c491bfe650 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1229,10 +1229,9 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn has_dirty_data(&self) -> bool { - !self - .pending_data_batch + self.pending_data_batch .as_ref() - .map_or(true, |b| b.is_empty()) + .map_or(false, |b| b.has_data()) } /// Set the current lsn @@ -1408,7 +1407,7 @@ impl<'a> DatadirModification<'a> { Some(pending_batch) => { pending_batch.extend(batch); } - None if !batch.is_empty() => { + None if batch.has_data() => { self.pending_data_batch = Some(batch); } None => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4881be33a6..f6a06e73a7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2470,6 +2470,7 @@ impl Timeline { *guard = Some(WalReceiver::start( Arc::clone(self), WalReceiverConf { + protocol: self.conf.wal_receiver_protocol, wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, @@ -5896,7 +5897,7 @@ impl<'a> TimelineWriter<'a> { batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { - if batch.is_empty() { + if !batch.has_data() { return Ok(()); } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4a3a5c621b..f831f5e48a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -38,6 +38,7 @@ use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::postgres_client::PostgresClientProtocol; use self::connection_manager::ConnectionManagerStatus; @@ -45,6 +46,7 @@ use super::Timeline; #[derive(Clone)] pub struct WalReceiverConf { + pub protocol: PostgresClientProtocol, /// The timeout on the connection to safekeeper for WAL streaming. pub wal_connect_timeout: Duration, /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. 
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index de50f217d8..7a64703a30 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -36,7 +36,9 @@ use postgres_connection::PgConnectionConfig; use utils::backoff::{ exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::postgres_client::wal_stream_connection_config; +use utils::postgres_client::{ + wal_stream_connection_config, ConnectionConfigArgs, PostgresClientProtocol, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, @@ -984,15 +986,33 @@ impl ConnectionManagerState { if info.safekeeper_connstr.is_empty() { return None; // no connection string, ignore sk } - match wal_stream_connection_config( - self.id, - info.safekeeper_connstr.as_ref(), - match &self.conf.auth_token { - None => None, - Some(x) => Some(x), + + let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol { + PostgresClientProtocol::Vanilla => { + (None, None, None) }, - self.conf.availability_zone.as_deref(), - ) { + PostgresClientProtocol::Interpreted => { + let shard_identity = self.timeline.get_shard_identity(); + ( + Some(shard_identity.number.0), + Some(shard_identity.count.0), + Some(shard_identity.stripe_size.0), + ) + } + }; + + let connection_conf_args = ConnectionConfigArgs { + protocol: self.conf.protocol, + ttid: self.id, + shard_number, + shard_count, + shard_stripe_size, + listen_pg_addr_str: info.safekeeper_connstr.as_ref(), + auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()), + availability_zone: self.conf.availability_zone.as_deref() + }; + + match wal_stream_connection_config(connection_conf_args) { Ok(connstr) => Some((*sk_id, info, connstr)), Err(e) => { error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id); @@ -1096,6 +1116,7 @@ impl ReconnectReason { mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL; use url::Host; fn dummy_broker_sk_timeline( @@ -1532,6 +1553,7 @@ mod tests { timeline, cancel: CancellationToken::new(), conf: WalReceiverConf { + protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 6ac6920d47..1a0e66ceb3 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -36,7 +36,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn}; +use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. 
@@ -291,6 +291,15 @@ pub(super) async fn handle_walreceiver_connection( connection_status.latest_connection_update = now; connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); } + ReplicationMessage::RawInterpretedWalRecords(raw) => { + connection_status.latest_connection_update = now; + if !raw.data().is_empty() { + connection_status.latest_wal_update = now; + } + + connection_status.commit_lsn = Some(Lsn::from(raw.commit_lsn())); + connection_status.streaming_lsn = Some(Lsn::from(raw.streaming_lsn())); + } &_ => {} }; if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { @@ -298,7 +307,130 @@ pub(super) async fn handle_walreceiver_connection( return Ok(()); } + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + let status_update = match replication_message { + ReplicationMessage::RawInterpretedWalRecords(raw) => { + WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64); + + let mut uncommitted_records = 0; + let mut filtered_records = 0; + + // This is the end LSN of the raw WAL from which the records + // were interpreted. + let streaming_lsn = Lsn::from(raw.streaming_lsn()); + tracing::debug!( + "Received WAL up to {streaming_lsn} with next_record_lsn={}", + Lsn(raw.next_record_lsn().unwrap_or(0)) + ); + + let records = Vec::::des(raw.data()).with_context(|| { + anyhow::anyhow!( + "Failed to deserialize interpreted records ending at LSN {streaming_lsn}" + ) + })?; + + // We start the modification at 0 because each interpreted record + // advances it to its end LSN. 0 is just an initialization placeholder. + let mut modification = timeline.begin_modification(Lsn(0)); + + for interpreted in records { + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) + && uncommitted_records > 0 + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + + let next_record_lsn = interpreted.next_record_lsn; + let ingested = walingest + .ingest_record(interpreted, &mut modification, &ctx) + .await + .with_context(|| format!("could not ingest record at {next_record_lsn}"))?; + + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } + + uncommitted_records += 1; + + // FIXME: this cannot be made pausable_failpoint without fixing the + // failpoint library; in tests, the added amount of debugging will cause us + // to timeout the tests. + fail_point!("walreceiver-after-ingest"); + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + if uncommitted_records >= ingest_batch_size + || modification.approx_pending_bytes() + > DatadirModification::MAX_PENDING_BYTES + { + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + } + + // Records might have been filtered out on the safekeeper side, but we still + // need to advance last record LSN on all shards. If we've not ingested the latest + // record, then set the LSN of the modification past it. This way all shards + // advance their last record LSN at the same time. 
+ let needs_last_record_lsn_advance = match raw.next_record_lsn().map(Lsn::from) { + Some(lsn) if lsn > modification.get_lsn() => { + modification.set_lsn(lsn).unwrap(); + true + } + _ => false, + }; + + if uncommitted_records > 0 || needs_last_record_lsn_advance { + // Commit any uncommitted records + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + + if !caught_up && streaming_lsn >= end_of_wal { + info!("caught up at LSN {streaming_lsn}"); + caught_up = true; + } + + tracing::debug!( + "Ingested WAL up to {streaming_lsn}. Last record LSN is {}", + timeline.get_last_record_lsn() + ); + + Some(streaming_lsn) + } + ReplicationMessage::XLogData(xlog_data) => { // Pass the WAL data to the decoder, and see if we can decode // more records as a result. @@ -316,21 +448,6 @@ pub(super) async fn handle_walreceiver_connection( let mut uncommitted_records = 0; let mut filtered_records = 0; - async fn commit( - modification: &mut DatadirModification<'_>, - uncommitted: &mut u64, - filtered: &mut u64, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - WAL_INGEST - .records_committed - .inc_by(*uncommitted - *filtered); - modification.commit(ctx).await?; - *uncommitted = 0; - *filtered = 0; - Ok(()) - } - while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index ab77b63d54..635a9222e1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -28,6 +28,7 @@ hyper0.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true +pageserver_api.workspace = true postgres.workspace = true postgres-protocol.workspace = true pprof.workspace = true @@ -58,6 +59,7 @@ sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true +wal_decoder.workspace = true workspace_hack.workspace = true diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 3f00b69cde..cec7c3c7ee 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,11 +2,15 @@ //! protocol commands. use anyhow::Context; +use pageserver_api::models::ShardParameters; +use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; +use utils::postgres_client::PostgresClientProtocol; +use utils::shard::{ShardCount, ShardNumber}; use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; @@ -35,6 +39,8 @@ pub struct SafekeeperPostgresHandler { pub tenant_id: Option, pub timeline_id: Option, pub ttid: TenantTimelineId, + pub shard: Option, + pub protocol: Option, /// Unique connection id is logged in spans for observability. pub conn_id: ConnectionId, /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured. @@ -107,11 +113,28 @@ impl postgres_backend::Handler ) -> Result<(), QueryError> { if let FeStartupPacket::StartupMessage { params, .. 
} = sm { if let Some(options) = params.options_raw() { + let mut shard_count: Option = None; + let mut shard_number: Option = None; + let mut shard_stripe_size: Option = None; + for opt in options { // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, // remove these after the PR gets deployed: // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { + Some(("protocol", value)) => { + let raw_value = value + .parse::() + .with_context(|| format!("Failed to parse {value} as protocol"))?; + + self.protocol = Some( + PostgresClientProtocol::try_from(raw_value).map_err(|_| { + QueryError::Other(anyhow::anyhow!( + "Unexpected client protocol type: {raw_value}" + )) + })?, + ); + } Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse().with_context(|| { format!("Failed to parse {value} as tenant id") @@ -127,9 +150,54 @@ impl postgres_backend::Handler metrics.set_client_az(client_az) } } + Some(("shard_count", value)) => { + shard_count = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard count") + })?); + } + Some(("shard_number", value)) => { + shard_number = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard number") + })?); + } + Some(("shard_stripe_size", value)) => { + shard_stripe_size = Some(value.parse::().with_context(|| { + format!("Failed to parse {value} as shard stripe size") + })?); + } _ => continue, } } + + match self.protocol() { + PostgresClientProtocol::Vanilla => { + if shard_count.is_some() + || shard_number.is_some() + || shard_stripe_size.is_some() + { + return Err(QueryError::Other(anyhow::anyhow!( + "Shard params specified for vanilla protocol" + ))); + } + } + PostgresClientProtocol::Interpreted => { + match (shard_count, shard_number, shard_stripe_size) { + (Some(count), Some(number), Some(stripe_size)) => { + let params = ShardParameters { + count: ShardCount(count), + stripe_size: ShardStripeSize(stripe_size), + }; + self.shard = + Some(ShardIdentity::from_params(ShardNumber(number), ¶ms)); + } + _ => { + return Err(QueryError::Other(anyhow::anyhow!( + "Shard params were not specified" + ))); + } + } + } + } } if let Some(app_name) = params.get("application_name") { @@ -150,6 +218,11 @@ impl postgres_backend::Handler tracing::field::debug(self.appname.clone()), ); + if let Some(shard) = self.shard.as_ref() { + tracing::Span::current() + .record("shard", tracing::field::display(shard.shard_slug())); + } + Ok(()) } else { Err(QueryError::Other(anyhow::anyhow!( @@ -258,6 +331,8 @@ impl SafekeeperPostgresHandler { tenant_id: None, timeline_id: None, ttid: TenantTimelineId::empty(), + shard: None, + protocol: None, conn_id, claims: None, auth, @@ -265,6 +340,10 @@ impl SafekeeperPostgresHandler { } } + pub fn protocol(&self) -> PostgresClientProtocol { + self.protocol.unwrap_or(PostgresClientProtocol::Vanilla) + } + // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id fn check_permission(&self, tenant_id: Option) -> Result<(), QueryError> { diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 6d68b6b59b..abe6e00a66 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -29,6 +29,7 @@ pub mod receive_wal; pub mod recovery; pub mod remove_wal; pub mod safekeeper; +pub mod send_interpreted_wal; pub mod send_wal; pub mod state; pub mod timeline; @@ -38,6 +39,7 @@ pub mod timeline_manager; pub mod timelines_set; 
pub mod wal_backup; pub mod wal_backup_partial; +pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 9c4149d8f1..7b87166aa0 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -17,6 +17,7 @@ use tokio::{ use tokio_postgres::replication::ReplicationStream; use tokio_postgres::types::PgLsn; use tracing::*; +use utils::postgres_client::{ConnectionConfigArgs, PostgresClientProtocol}; use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config}; use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; @@ -325,7 +326,17 @@ async fn recovery_stream( conf: &SafeKeeperConf, ) -> anyhow::Result { // TODO: pass auth token - let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?; + let connection_conf_args = ConnectionConfigArgs { + protocol: PostgresClientProtocol::Vanilla, + ttid: tli.ttid, + shard_number: None, + shard_count: None, + shard_stripe_size: None, + listen_pg_addr_str: &donor.pg_connstr, + auth_token: None, + availability_zone: None, + }; + let cfg = wal_stream_connection_config(connection_conf_args)?; let mut cfg = cfg.to_tokio_postgres_config(); // It will make safekeeper give out not committed WAL (up to flush_lsn). cfg.application_name(&format!("safekeeper_{}", conf.my_id)); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs new file mode 100644 index 0000000000..cf0ee276e9 --- /dev/null +++ b/safekeeper/src/send_interpreted_wal.rs @@ -0,0 +1,121 @@ +use std::time::Duration; + +use anyhow::Context; +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; +use postgres_ffi::MAX_SEND_SIZE; +use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; +use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::time::MissedTickBehavior; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; +use wal_decoder::models::InterpretedWalRecord; + +use crate::send_wal::EndWatchView; +use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; + +/// Shard-aware interpreted record sender. +/// This is used for sending WAL to the pageserver. Said WAL +/// is pre-interpreted and filtered for the shard. +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) wal_stream_builder: WalReaderStreamBuilder, + pub(crate) end_watch_view: EndWatchView, + pub(crate) shard: ShardIdentity, + pub(crate) pg_version: u32, + pub(crate) appname: Option, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL to a receiver. + /// Stops when an error occurs or the receiver is caught up and there's no active compute. + /// + /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? + /// convenience. + pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> { + let mut wal_position = self.wal_stream_builder.start_pos(); + let mut wal_decoder = + WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version); + + let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; + let mut stream = std::pin::pin!(stream); + + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + loop { + tokio::select! 
{ + // Get some WAL from the stream and then: decode, interpret and send it + wal = stream.next() => { + let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { + Some(some) => some?, + None => { break; } + }; + + wal_position = wal_end_lsn; + wal_decoder.feed_bytes(&wal); + + let mut records = Vec::new(); + let mut max_next_record_lsn = None; + while let Some((next_record_lsn, recdata)) = wal_decoder + .poll_decode() + .with_context(|| "Failed to decode WAL")? + { + assert!(next_record_lsn.is_aligned()); + max_next_record_lsn = Some(next_record_lsn); + + // Deserialize and interpret WAL record + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + &self.shard, + next_record_lsn, + self.pg_version, + ) + .with_context(|| "Failed to interpret WAL")?; + + if !interpreted.is_empty() { + records.push(interpreted); + } + } + + let mut buf = Vec::new(); + records + .ser_into(&mut buf) + .with_context(|| "Failed to serialize interpreted WAL")?; + + // Reset the keep alive ticker since we are sending something + // over the wire now. + keepalive_ticker.reset(); + + self.pgb + .write_message(&BeMessage::InterpretedWalRecords(InterpretedWalRecordsBody { + streaming_lsn: wal_end_lsn.0, + commit_lsn: available_wal_end_lsn.0, + next_record_lsn: max_next_record_lsn.unwrap_or(Lsn::INVALID).0, + data: buf.as_slice(), + })).await?; + } + + // Send a periodic keep alive when the connection has been idle for a while. + _ = keepalive_ticker.tick() => { + self.pgb + .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + wal_end: self.end_watch_view.get().0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .await?; + } + } + } + + // The loop above ends when the receiver is caught up and there's no more WAL to send. + Err(CopyStreamHandlerEnd::ServerInitiated(format!( + "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", + self.appname, wal_position, + ))) + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index aa65ec851b..1acfcad418 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,12 +5,15 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; +use crate::send_interpreted_wal::InterpretedWalSender; use crate::timeline::WalResidentTimeline; +use crate::wal_reader_stream::WalReaderStreamBuilder; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; +use futures::future::Either; use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; @@ -22,6 +25,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; +use utils::postgres_client::PostgresClientProtocol; use std::cmp::{max, min}; use std::net::SocketAddr; @@ -226,7 +230,7 @@ impl WalSenders { /// Get remote_consistent_lsn reported by the pageserver. Returns None if /// client is not pageserver. 
- fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { + pub fn get_ws_remote_consistent_lsn(self: &Arc, id: WalSenderId) -> Option { let shared = self.mutex.lock(); let slot = shared.get_slot(id); match slot.feedback { @@ -370,6 +374,16 @@ pub struct WalSenderGuard { walsenders: Arc, } +impl WalSenderGuard { + pub fn id(&self) -> WalSenderId { + self.id + } + + pub fn walsenders(&self) -> &Arc { + &self.walsenders + } +} + impl Drop for WalSenderGuard { fn drop(&mut self) { self.walsenders.unregister(self.id); @@ -440,11 +454,12 @@ impl SafekeeperPostgresHandler { } info!( - "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}", + "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={}", start_pos, end_pos, matches!(end_watch, EndWatch::Flush(_)), - appname + appname, + self.protocol(), ); // switch to copy @@ -456,21 +471,51 @@ impl SafekeeperPostgresHandler { // not synchronized with sends, so this avoids deadlocks. let reader = pgb.split().context("START_REPLICATION split")?; + let send_fut = match self.protocol() { + PostgresClientProtocol::Vanilla => { + let sender = WalSender { + pgb, + // should succeed since we're already holding another guard + tli: tli.wal_residence_guard().await?, + appname, + start_pos, + end_pos, + term, + end_watch, + ws_guard: ws_guard.clone(), + wal_reader, + send_buf: vec![0u8; MAX_SEND_SIZE], + }; + + Either::Left(sender.run()) + } + PostgresClientProtocol::Interpreted => { + let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; + let end_watch_view = end_watch.view(); + let wal_stream_builder = WalReaderStreamBuilder { + tli: tli.wal_residence_guard().await?, + start_pos, + end_pos, + term, + end_watch, + wal_sender_guard: ws_guard.clone(), + }; + + let sender = InterpretedWalSender { + pgb, + wal_stream_builder, + end_watch_view, + shard: self.shard.unwrap(), + pg_version, + appname, + }; + + Either::Right(sender.run()) + } + }; + let tli_cancel = tli.cancel.clone(); - let mut sender = WalSender { - pgb, - // should succeed since we're already holding another guard - tli: tli.wal_residence_guard().await?, - appname, - start_pos, - end_pos, - term, - end_watch, - ws_guard: ws_guard.clone(), - wal_reader, - send_buf: vec![0u8; MAX_SEND_SIZE], - }; let mut reply_reader = ReplyReader { reader, ws_guard: ws_guard.clone(), @@ -479,7 +524,7 @@ impl SafekeeperPostgresHandler { let res = tokio::select! { // todo: add read|write .context to these errors - r = sender.run() => r, + r = send_fut => r, r = reply_reader.run() => r, _ = tli_cancel.cancelled() => { return Err(CopyStreamHandlerEnd::Cancelled); @@ -504,16 +549,22 @@ impl SafekeeperPostgresHandler { } } +/// TODO(vlad): maybe lift this instead /// Walsender streams either up to commit_lsn (normally) or flush_lsn in the /// given term (recovery by walproposer or peer safekeeper). -enum EndWatch { +#[derive(Clone)] +pub(crate) enum EndWatch { Commit(Receiver), Flush(Receiver), } impl EndWatch { + pub(crate) fn view(&self) -> EndWatchView { + EndWatchView(self.clone()) + } + /// Get current end of WAL. - fn get(&self) -> Lsn { + pub(crate) fn get(&self) -> Lsn { match self { EndWatch::Commit(r) => *r.borrow(), EndWatch::Flush(r) => r.borrow().lsn, @@ -521,15 +572,44 @@ impl EndWatch { } /// Wait for the update. 
- async fn changed(&mut self) -> anyhow::Result<()> { + pub(crate) async fn changed(&mut self) -> anyhow::Result<()> { match self { EndWatch::Commit(r) => r.changed().await?, EndWatch::Flush(r) => r.changed().await?, } Ok(()) } + + pub(crate) async fn wait_for_lsn( + &mut self, + lsn: Lsn, + client_term: Option, + ) -> anyhow::Result { + loop { + let end_pos = self.get(); + if end_pos > lsn { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self { + let curr_term = rx.borrow().term; + if let Some(client_term) = client_term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.changed().await?; + } + } } +pub(crate) struct EndWatchView(EndWatch); + +impl EndWatchView { + pub(crate) fn get(&self) -> Lsn { + self.0.get() + } +} /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, @@ -566,7 +646,7 @@ impl WalSender<'_, IO> { /// /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. - async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { + async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs new file mode 100644 index 0000000000..f8c0c502cd --- /dev/null +++ b/safekeeper/src/wal_reader_stream.rs @@ -0,0 +1,149 @@ +use std::sync::Arc; + +use async_stream::try_stream; +use bytes::Bytes; +use futures::Stream; +use postgres_backend::CopyStreamHandlerEnd; +use std::time::Duration; +use tokio::time::timeout; +use utils::lsn::Lsn; + +use crate::{ + safekeeper::Term, + send_wal::{EndWatch, WalSenderGuard}, + timeline::WalResidentTimeline, +}; + +pub(crate) struct WalReaderStreamBuilder { + pub(crate) tli: WalResidentTimeline, + pub(crate) start_pos: Lsn, + pub(crate) end_pos: Lsn, + pub(crate) term: Option, + pub(crate) end_watch: EndWatch, + pub(crate) wal_sender_guard: Arc, +} + +impl WalReaderStreamBuilder { + pub(crate) fn start_pos(&self) -> Lsn { + self.start_pos + } +} + +pub(crate) struct WalBytes { + /// Raw PG WAL + pub(crate) wal: Bytes, + /// Start LSN of [`Self::wal`] + #[allow(dead_code)] + pub(crate) wal_start_lsn: Lsn, + /// End LSN of [`Self::wal`] + pub(crate) wal_end_lsn: Lsn, + /// End LSN of WAL available on the safekeeper. + /// + /// For pagservers this will be commit LSN, + /// while for the compute it will be the flush LSN. + pub(crate) available_wal_end_lsn: Lsn, +} + +impl WalReaderStreamBuilder { + /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. + /// The stream terminates when the receiver (pageserver) is fully caught up + /// and there's no active computes. + pub(crate) async fn build( + self, + buffer_size: usize, + ) -> anyhow::Result>> { + // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. + // We can make the raw WAL sender use this stream too and remove the duplication. + let Self { + tli, + mut start_pos, + mut end_pos, + term, + mut end_watch, + wal_sender_guard, + } = self; + let mut wal_reader = tli.get_walreader(start_pos).await?; + let mut buffer = vec![0; buffer_size]; + + const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + + Ok(try_stream! 
{ + loop { + let have_something_to_send = end_pos > start_pos; + + if !have_something_to_send { + // wait for lsn + let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; + match res { + Ok(ok) => { + end_pos = ok?; + }, + Err(_) => { + if let EndWatch::Commit(_) = end_watch { + if let Some(remote_consistent_lsn) = wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(wal_sender_guard.id()) + { + if tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. + return; + } + } + } + + continue; + } + } + } + + + assert!( + end_pos > start_pos, + "nothing to send after waiting for WAL" + ); + + // try to send as much as available, capped by the buffer size + let mut chunk_end_pos = start_pos + buffer_size as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= end_pos { + chunk_end_pos = end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - start_pos.0) as usize; + let buffer = &mut buffer[..send_size]; + let send_size: usize; + { + // If uncommitted part is being pulled, check that the term is + // still the expected one. + let _term_guard = if let Some(t) = term { + Some(tli.acquire_term(t).await?) + } else { + None + }; + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. + send_size = wal_reader.read(buffer).await? + }; + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + + yield WalBytes { + wal, + wal_start_lsn: start_pos, + wal_end_lsn: start_pos + send_size as u64, + available_wal_end_lsn: end_pos + }; + + start_pos += send_size as u64; + } + }) + } +} diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 77e8f2cf17..e965aae5a0 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -15,16 +15,21 @@ from fixtures.neon_fixtures import ( @pytest.mark.timeout(600) @pytest.mark.parametrize("shard_count", [1, 8, 32]) +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) def test_sharded_ingest( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, shard_count: int, + wal_receiver_protocol: str, ): """ Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case (shard_count=1) to the sharded case indicates the overhead of sharding. """ + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) ROW_COUNT = 100_000_000 # about 7 GB of WAL @@ -50,7 +55,6 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) - # Ingest data and measure WAL volume and duration. 
with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -68,4 +72,48 @@ def test_sharded_ingest( wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + total_ingested = 0 + total_records_received = 0 + ingested_by_ps = [] + for pageserver in env.pageservers: + ingested = pageserver.http_client().get_metric_value( + "pageserver_wal_ingest_bytes_received_total" + ) + records_received = pageserver.http_client().get_metric_value( + "pageserver_wal_ingest_records_received_total" + ) + + if ingested is None: + ingested = 0 + + if records_received is None: + records_received = 0 + + ingested_by_ps.append( + ( + pageserver.id, + { + "ingested": ingested, + "records_received": records_received, + }, + ) + ) + + total_ingested += int(ingested) + total_records_received += int(records_received) + + total_ingested_mb = total_ingested / (1024 * 1024) + zenbenchmark.record("wal_ingested", total_ingested_mb, "MB", MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "records_received", total_records_received, "records", MetricReport.LOWER_IS_BETTER + ) + + ingested_by_ps.sort(key=lambda x: x[0]) + for _, stats in ingested_by_ps: + for k in stats: + if k != "records_received": + stats[k] /= 1024**2 + + log.info(f"WAL ingested by each pageserver {ingested_by_ps}") + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index f71e05924a..79fd256304 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -27,7 +27,8 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { @skip_in_debug_build("only run with release build") -def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): """ This is a smoke test that compaction kicks in. The workload repeatedly churns a small number of rows and manually instructs the pageserver to run compaction @@ -38,8 +39,8 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): # Effectively disable the page cache to rely only on image layers # to shorten reads. 
- neon_env_builder.pageserver_config_override = """ -page_cache_size=10 + neon_env_builder.pageserver_config_override = f""" +page_cache_size=10; wal_receiver_protocol='{wal_receiver_protocol}' """ env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 23c6fa3a5a..70e71d99cd 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,7 +19,14 @@ from fixtures.neon_fixtures import NeonEnvBuilder "wal_record_crossing_segment_followed_by_small_one", ], ) -def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_crafted_wal_end( + neon_env_builder: NeonEnvBuilder, wal_type: str, wal_receiver_protocol: str +): + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) + env = neon_env_builder.init_start() env.create_branch("test_crafted_wal_end") env.pageserver.allowed_errors.extend( diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 7a46f0140c..1d86c353be 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,6 +1,7 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, check_restored_datadir_content # Test subtransactions @@ -9,8 +10,13 @@ from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # maintained in the pageserver, so subtransactions are not very exciting for # Neon. They are included in the commit record though and updated in the # CLOG. -def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): - env = neon_simple_env +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol): + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) + + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 18408b0619..094b10b576 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -622,8 +622,12 @@ async def run_segment_init_failure(env: NeonEnv): # Test (injected) failure during WAL segment init. 
# https://github.com/neondatabase/neon/issues/6401 # https://github.com/neondatabase/neon/issues/6402 -def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): neon_env_builder.num_safekeepers = 1 + neon_env_builder.pageserver_config_override = ( + f"wal_receiver_protocol = '{wal_receiver_protocol}'" + ) env = neon_env_builder.init_start() asyncio.run(run_segment_init_failure(env)) From 87e4dd23a1d92ad99665b109ba58b7050a934b4d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 25 Nov 2024 18:53:26 +0100 Subject: [PATCH 07/29] proxy: Demote all cplane error replies to info log level (#9880) ## Problem The vast majority of the error/warn logs from cplane are about time or data transfer quotas exceeded or endpoint-not-found errors and not operational errors in proxy or cplane. ## Summary of changes * Demote cplane error replies to info level. * Raise other errors from warn back to error. --- proxy/src/proxy/wake_compute.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 8a672d48dc..4e9206feff 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,9 +1,9 @@ -use tracing::{error, info, warn}; +use tracing::{error, info}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestContext; -use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::errors::{ControlPlaneError, WakeComputeError}; use crate::control_plane::CachedNodeInfo; use crate::error::ReportableError; use crate::metrics::{ @@ -11,6 +11,18 @@ use crate::metrics::{ }; use crate::proxy::retry::{retry_after, should_retry}; +// Use macro to retain original callsite. +macro_rules! log_wake_compute_error { + (error = ?$error:expr, $num_retries:expr, retriable = $retriable:literal) => { + match $error { + WakeComputeError::ControlPlane(ControlPlaneError::Message(_)) => { + info!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node") + } + _ => error!(error = ?$error, num_retries = $num_retries, retriable = $retriable, "couldn't wake compute node"), + } + }; +} + pub(crate) async fn wake_compute( num_retries: &mut u32, ctx: &RequestContext, @@ -20,7 +32,7 @@ pub(crate) async fn wake_compute( loop { match api.wake_compute(ctx).await { Err(e) if !should_retry(&e, *num_retries, config) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + log_wake_compute_error!(error = ?e, num_retries, retriable = false); report_error(&e, false); Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { @@ -32,7 +44,7 @@ pub(crate) async fn wake_compute( return Err(e); } Err(e) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + log_wake_compute_error!(error = ?e, num_retries, retriable = true); report_error(&e, true); } Ok(n) => { From 7404887b810403768a950006335877aa6a966c8d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 25 Nov 2024 20:35:32 +0100 Subject: [PATCH 08/29] proxy: Demote errors from cplane request routines to debug (#9886) ## Problem Any errors from these async blocks are unconditionally logged at error level even though we already handle such errors based on context. 
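As a rough sketch of the pattern this moves towards (hypothetical names, not code from this patch): the raw error is logged inline at debug level where the request is made, which also keeps the tracing callsite at that routine rather than at a shared helper, while the caller stays responsible for surfacing a user-facing error.

```rust
use tracing::debug;

// Hypothetical request routine: record the raw failure at debug level right
// at the callsite; callers decide whether it becomes a warning or an error.
fn do_cplane_request(endpoint: &str) -> Result<String, String> {
    send(endpoint).inspect_err(|e| debug!(error = ?e, "cplane request failed"))
}

// Stand-in for the actual HTTP call.
fn send(endpoint: &str) -> Result<String, String> {
    Err(format!("unreachable endpoint {endpoint}"))
}
```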
## Summary of changes * Log raw errors from creating and executing cplane requests at debug level. * Inline macro calls to retain the correct callsite. --- proxy/src/control_plane/client/mock.rs | 2 +- proxy/src/control_plane/client/neon.rs | 13 ++++++------- proxy/src/error.rs | 6 ------ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 500acad50f..9537d717a1 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -114,7 +114,7 @@ impl MockControlPlane { Ok((secret, allowed_ips)) } - .map_err(crate::error::log_error::) + .inspect_err(|e: &GetAuthInfoError| tracing::error!("{e}")) .instrument(info_span!("postgres", url = self.endpoint.as_str())) .await?; Ok(AuthInfo { diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 757ea6720a..2cad981d01 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -134,8 +134,8 @@ impl NeonControlPlaneClient { project_id: body.project_id, }) } - .map_err(crate::error::log_error) - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) .await } @@ -193,8 +193,8 @@ impl NeonControlPlaneClient { Ok(rules) } - .map_err(crate::error::log_error) - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) .await } @@ -252,9 +252,8 @@ impl NeonControlPlaneClient { Ok(node) } - .map_err(crate::error::log_error) - // TODO: redo this span stuff - .instrument(info_span!("http", id = request_id)) + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) .await } } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 7b693a7418..2221aac407 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -10,12 +10,6 @@ pub(crate) fn io_error(e: impl Into>) -> io::Err io::Error::new(io::ErrorKind::Other, e) } -/// A small combinator for pluggable error logging. -pub(crate) fn log_error(e: E) -> E { - tracing::error!("{e}"); - e -} - /// Marks errors that may be safely shown to a client. /// This trait can be seen as a specialized version of [`ToString`]. /// From a74ab9338d85ae0fcac1fa965c1119a4d74c98df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 25 Nov 2024 21:23:42 +0100 Subject: [PATCH 09/29] fast_import: remove hardcoding of pg_version (#9878) Before, we hardcoded the pg_version to 140000, while the code expected version numbers like 14. Now we use an enum, and code from `extension_server.rs` to auto-detect the correct version. The enum helps when we add support for a version: enums ensure that compilation fails if one forgets to put the version to one of the `match` locations. 
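A minimal sketch of the compile-time guarantee being relied on here (the real definitions live in `extension_server.rs` in the diff below; the function name is illustrative only):

```rust
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum PostgresMajorVersion {
    V14,
    V15,
    V16,
    V17,
}

// No `_ =>` fallback arm: adding a hypothetical V18 variant to the enum makes
// this match fail to compile until the new case is handled explicitly.
fn initdb_major(v: PostgresMajorVersion) -> u32 {
    match v {
        PostgresMajorVersion::V14 => 14,
        PostgresMajorVersion::V15 => 15,
        PostgresMajorVersion::V16 => 16,
        PostgresMajorVersion::V17 => 17,
    }
}
```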
cc https://github.com/neondatabase/neon/pull/9218 --- compute_tools/src/bin/compute_ctl.rs | 4 +- compute_tools/src/bin/fast_import.rs | 9 ++++- compute_tools/src/extension_server.rs | 54 ++++++++++++++++++--------- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 4689cc2b83..6b670de2ea 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -58,7 +58,7 @@ use compute_tools::compute::{ forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, }; use compute_tools::configurator::launch_configurator; -use compute_tools::extension_server::get_pg_version; +use compute_tools::extension_server::get_pg_version_string; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; @@ -326,7 +326,7 @@ fn wait_spec( connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), - pgversion: get_pg_version(pgbin), + pgversion: get_pg_version_string(pgbin), live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 3b0b990df2..6716cc6234 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -29,6 +29,7 @@ use anyhow::Context; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; +use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; use tracing::{info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; @@ -131,11 +132,17 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Initialize pgdata // + let pg_version = match get_pg_version(pg_bin_dir.as_str()) { + PostgresMajorVersion::V14 => 14, + PostgresMajorVersion::V15 => 15, + PostgresMajorVersion::V16 => 16, + PostgresMajorVersion::V17 => 17, + }; let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser, locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded, - pg_version: 140000, // XXX: this shouldn't be hard-coded but derived from which compute image we're running in + pg_version, initdb_bin: pg_bin_dir.join("initdb").as_ref(), library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local. pgdata: &pgdata_dir, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index da2d107b54..f13b2308e7 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -103,14 +103,33 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String { .to_string() } -pub fn get_pg_version(pgbin: &str) -> String { +pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. 
let human_version = get_pg_config("--version", pgbin); - parse_pg_version(&human_version).to_string() + parse_pg_version(&human_version) } -fn parse_pg_version(human_version: &str) -> &str { +pub fn get_pg_version_string(pgbin: &str) -> String { + match get_pg_version(pgbin) { + PostgresMajorVersion::V14 => "v14", + PostgresMajorVersion::V15 => "v15", + PostgresMajorVersion::V16 => "v16", + PostgresMajorVersion::V17 => "v17", + } + .to_owned() +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum PostgresMajorVersion { + V14, + V15, + V16, + V17, +} + +fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { + use PostgresMajorVersion::*; // Normal releases have version strings like "PostgreSQL 15.4". But there // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version @@ -121,10 +140,10 @@ fn parse_pg_version(human_version: &str) -> &str { .captures(human_version) { Some(captures) if captures.len() == 2 => match &captures["major"] { - "14" => return "v14", - "15" => return "v15", - "16" => return "v16", - "17" => return "v17", + "14" => return V14, + "15" => return V15, + "16" => return V16, + "17" => return V17, _ => {} }, _ => {} @@ -263,24 +282,25 @@ mod tests { #[test] fn test_parse_pg_version() { - assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15"); - assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15"); + use super::PostgresMajorVersion::*; + assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15); + assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15); assert_eq!( parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"), - "v15" + V15 ); - assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14"); - assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14"); + assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14); + assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14); assert_eq!( parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"), - "v14" + V14 ); - assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16"); - assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16"); - assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16"); - assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16"); + assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16); + assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16); + assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16); + assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16); } #[test] From 96a1b71c84965782bb10c9fb591ff3fe43b1f8c5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 25 Nov 2024 21:32:53 +0000 Subject: [PATCH 10/29] chore(proxy): discard request context span during passthrough (#9882) ## Problem The RequestContext::span shouldn't live for the entire postgres connection, only the handshake. ## Summary of changes * Slight refactor to the RequestContext to discard the span upon handshake completion. 
* Make sure the temporary future for the handshake is dropped (not bound to a variable) * Runs our nightly fmt script --- proxy/src/cancellation.rs | 6 ++-- proxy/src/console_redirect_proxy.rs | 38 ++++++++++---------- proxy/src/context/mod.rs | 23 +++++++++---- proxy/src/proxy/mod.rs | 42 +++++++++++------------ proxy/src/proxy/passthrough.rs | 3 +- proxy/src/redis/cancellation_publisher.rs | 2 +- 6 files changed, 59 insertions(+), 55 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 4b72a66e63..74415f1ffe 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,7 +1,8 @@ -use std::net::SocketAddr; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; use dashmap::DashMap; +use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; @@ -17,9 +18,6 @@ use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; -use std::net::IpAddr; - -use ipnet::{IpNet, Ipv4Net, Ipv6Net}; pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index fbd0c8e5c5..b910b524b1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use futures::TryFutureExt; +use futures::{FutureExt, TryFutureExt}; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, Instrument}; @@ -88,40 +88,37 @@ pub async fn task_main( crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span(); - let startup = Box::pin( - handle_client( - config, - backend, - &ctx, - cancellation_handler, - socket, - conn_gauge, - ) - .instrument(span.clone()), - ); - let res = startup.await; + let res = handle_client( + config, + backend, + &ctx, + cancellation_handler, + socket, + conn_gauge, + ) + .instrument(ctx.span()) + .boxed() + .await; match res { Err(e) => { - // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - error!(parent: &span, "per-client task finished with an error: {e:#}"); + error!(parent: &ctx.span(), "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log_connect(); - match p.proxy_pass().instrument(span.clone()).await { + let _disconnect = ctx.log_connect(); + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + error!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); } Err(ErrorSource::Compute(e)) => { - error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); } } } @@ -219,6 +216,7 @@ pub(crate) async fn handle_client( client: stream, aux: node.aux.clone(), compute: node, + session_id: ctx.session_id(), _req: request_gauge, _conn: conn_gauge, _cancel: session, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 6d2d2d51ce..4ec04deb25 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -272,11 +272,14 @@ impl RequestContext { this.success = true; } - pub fn log_connect(&self) { - self.0 - .try_lock() - .expect("should not 
deadlock") - .log_connect(); + pub fn log_connect(self) -> DisconnectLogger { + let mut this = self.0.into_inner(); + this.log_connect(); + + // close current span. + this.span = Span::none(); + + DisconnectLogger(this) } pub(crate) fn protocol(&self) -> Protocol { @@ -434,8 +437,14 @@ impl Drop for RequestContextInner { fn drop(&mut self) { if self.sender.is_some() { self.log_connect(); - } else { - self.log_disconnect(); } } } + +pub struct DisconnectLogger(RequestContextInner); + +impl Drop for DisconnectLogger { + fn drop(&mut self) { + self.0.log_disconnect(); + } +} diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 5d9468d89a..7fe67e43de 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -10,7 +10,7 @@ pub(crate) mod wake_compute; use std::sync::Arc; pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; -use futures::TryFutureExt; +use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; @@ -123,42 +123,39 @@ pub async fn task_main( crate::metrics::Protocol::Tcp, &config.region, ); - let span = ctx.span(); - let startup = Box::pin( - handle_client( - config, - auth_backend, - &ctx, - cancellation_handler, - socket, - ClientMode::Tcp, - endpoint_rate_limiter2, - conn_gauge, - ) - .instrument(span.clone()), - ); - let res = startup.await; + let res = handle_client( + config, + auth_backend, + &ctx, + cancellation_handler, + socket, + ClientMode::Tcp, + endpoint_rate_limiter2, + conn_gauge, + ) + .instrument(ctx.span()) + .boxed() + .await; match res { Err(e) => { - // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - warn!(parent: &span, "per-client task finished with an error: {e:#}"); + warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); } Ok(Some(p)) => { ctx.set_success(); - ctx.log_connect(); - match p.proxy_pass().instrument(span.clone()).await { + let _disconnect = ctx.log_connect(); + match p.proxy_pass().await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - warn!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + warn!(?session_id, "per-client task finished with an IO error from the client: {e:#}"); } Err(ErrorSource::Compute(e)) => { - error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + error!(?session_id, "per-client task finished with an IO error from the compute: {e:#}"); } } } @@ -352,6 +349,7 @@ pub(crate) async fn handle_client( client: stream, aux: node.aux.clone(), compute: node, + session_id: ctx.session_id(), _req: request_gauge, _conn: conn_gauge, _cancel: session, diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 5e07c8eeae..dcaa81e5cd 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -59,6 +59,7 @@ pub(crate) struct ProxyPassthrough { pub(crate) client: Stream, pub(crate) compute: PostgresConnection, pub(crate) aux: MetricsAuxInfo, + pub(crate) session_id: uuid::Uuid, pub(crate) _req: NumConnectionRequestsGuard<'static>, pub(crate) _conn: NumClientConnectionsGuard<'static>, @@ -69,7 +70,7 @@ impl ProxyPassthrough { pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; if let Err(err) = self.compute.cancel_closure.try_cancel_query().await { - tracing::warn!(?err, "could not cancel the query in the 
database"); + tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } res } diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 633a2f1b81..228dbb7f64 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -1,6 +1,6 @@ +use core::net::IpAddr; use std::sync::Arc; -use core::net::IpAddr; use pq_proto::CancelKeyData; use redis::AsyncCommands; use tokio::sync::Mutex; From 13feda0669d65cbac4b2103952caba1a9db1342e Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 26 Nov 2024 12:46:58 +0100 Subject: [PATCH 11/29] track how much time the flush loop is stalled waiting for uploads (#9885) ## Problem We don't know how much time PS is losing during ingest when waiting for remote storage uploads in the flush frozen layer loop. Also we don't know how many remote storage requests get an permit without waiting (not throttled by remote_storage concurrency_limit). ## Summary of changes - Add a metric that accumulates the time waited per shard/PS - in [remote storage semaphore wait seconds](https://neonprod.grafana.net/d/febd9732-9bcf-4992-a821-49b1f6b02724/remote-storage?orgId=1&var-datasource=HUNg6jvVk&var-instance=pageserver-26.us-east-2.aws.neon.build&var-instance=pageserver-27.us-east-2.aws.neon.build&var-instance=pageserver-28.us-east-2.aws.neon.build&var-instance=pageserver-29.us-east-2.aws.neon.build&var-instance=pageserver-30.us-east-2.aws.neon.build&var-instance=pageserver-31.us-east-2.aws.neon.build&var-instance=pageserver-36.us-east-2.aws.neon.build&var-instance=pageserver-37.us-east-2.aws.neon.build&var-instance=pageserver-38.us-east-2.aws.neon.build&var-instance=pageserver-39.us-east-2.aws.neon.build&var-instance=pageserver-40.us-east-2.aws.neon.build&var-instance=pageserver-41.us-east-2.aws.neon.build&var-request_type=put_object&from=1731961336340&to=1731964762933&viewPanel=3) add a first bucket with 100 microseconds to count requests that do not need to wait on semaphore Update: created a new version that uses a Gauge (one increasing value per PS/shard) instead of histogram as suggested by review --- libs/remote_storage/src/metrics.rs | 4 +++- pageserver/src/metrics.rs | 25 ++++++++++++++++++++++++- pageserver/src/tenant/timeline.rs | 5 ++++- test_runner/fixtures/metrics.py | 1 + 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/metrics.rs b/libs/remote_storage/src/metrics.rs index f1aa4c433b..48c121fbc8 100644 --- a/libs/remote_storage/src/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -176,7 +176,9 @@ pub(crate) struct BucketMetrics { impl Default for BucketMetrics { fn default() -> Self { - let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; + // first bucket 100 microseconds to count requests that do not need to wait at all + // and get a permit immediately + let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; let req_seconds = register_histogram_vec!( "remote_storage_s3_request_seconds", diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3cdc2a761e..5ce3ae6cf7 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, 
Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -457,6 +457,15 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_flush_wait_upload_seconds", + "Time spent waiting for preceding uploads during layer flush", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2336,6 +2345,7 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, + pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2379,6 +2389,9 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); + let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2516,6 +2529,7 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, + flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2563,6 +2577,14 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } + pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { + self.flush_wait_upload_time_gauge.add(duration); + crate::metrics::FLUSH_WAIT_UPLOAD_TIME + .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) + .unwrap() + .add(duration); + } + pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2579,6 +2601,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f6a06e73a7..c1ff0f426d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3830,7 +3830,8 @@ impl Timeline { }; // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote. + // This makes us refuse ingest until the new layers have been persisted to the remote + let start = Instant::now(); self.remote_client .wait_completion() .await @@ -3843,6 +3844,8 @@ impl Timeline { FlushLayerError::Other(anyhow!(e).into()) } })?; + let duration = start.elapsed().as_secs_f64(); + self.metrics.flush_wait_upload_time_gauge_add(duration); // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 330f007a77..3f90c233a6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -168,6 +168,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), From 2b788cb53f53606b0e56df540b762e853f7bc41b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 26 Nov 2024 11:49:37 -0600 Subject: [PATCH 12/29] Bump neon.logical_replication_max_snap_files default to 10000 (#9896) This bump comes from a recommendation from Chi. Signed-off-by: Tristan Partin --- pgxn/neon/logical_replication_monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 1badbbed21..5eee5a1679 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -20,7 +20,7 @@ #define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ -static int logical_replication_max_snap_files = 300; +static int logical_replication_max_snap_files = 10000; /* * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of @@ -184,7 +184,7 @@ InitLogicalReplicationMonitor(void) "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", NULL, &logical_replication_max_snap_files, - 300, -1, INT_MAX, + 10000, -1, INT_MAX, PGC_SIGHUP, 0, NULL, NULL, NULL); From 277c33ba3f47e88f7c032ce90d87b09df0c0e92c Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 27 Nov 2024 11:09:01 +0100 Subject: [PATCH 13/29] ingest benchmark: after effective_io_concurrency = 100 we can increase compute side parallelism (#9904) ## Problem ingest benchmark tests project migration to Neon involving steps - COPY relation data - create indexes - create constraints Previously we used only 4 copy jobs, 4 create index jobs and 7 maintenance workers. 
After increasing effective_io_concurrency on compute we see that we can sustain more parallelism in the ingest bench ## Summary of changes Increase copy jobs to 8, create index jobs to 8 and maintenance workers to 16 --- .../performance/test_perf_ingest_using_pgcopydb.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 2f4574ba88..37f2e9db50 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): "--no-acl", "--skip-db-properties", "--table-jobs", - "4", + "8", "--index-jobs", - "4", + "8", "--restore-jobs", - "4", + "8", "--split-tables-larger-than", - "10GB", + "5GB", "--skip-extensions", "--use-copy-binary", "--filters", @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16", } # Combine the current environment with custom variables env = os.environ.copy() From 7b41ee872eff41f6a0d427e86f6cd3e9563c6fee Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 27 Nov 2024 10:42:26 +0000 Subject: [PATCH 14/29] CI(pre-merge-checks): build only one build-tools-image (#9718) ## Problem The `pre-merge-checks` workflow relies on the build-tools image. If changes to the `build-tools` image have been merged into the main branch since the last CI run for a PR (with other changes to the `build-tools`), the image will be rebuilt during the merge queue run. Otherwise, cached images are used. Rebuilding the image adds approximately 10 minutes on x86-64 and 20 minutes on arm64 to the process. 
## Summary of changes - parametrise `build-build-tools-image` job with arch and Debian version - Run `pre-merge-checks` only on Debian 12 x86-64 image --- .github/workflows/build-build-tools-image.yml | 73 +++++++++++++------ .github/workflows/pre-merge-checks.yml | 9 ++- 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 93da86a353..0a7f0cd7a0 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -2,6 +2,17 @@ name: Build build-tools image on: workflow_call: + inputs: + archs: + description: "Json array of architectures to build" + # Default values are set in `check-image` job, `set-variables` step + type: string + required: false + debians: + description: "Json array of Debian versions to build" + # Default values are set in `check-image` job, `set-variables` step + type: string + required: false outputs: image-tag: description: "build-tools tag" @@ -32,25 +43,37 @@ jobs: check-image: runs-on: ubuntu-22.04 outputs: - tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} - found: ${{ steps.check-image.outputs.found }} + archs: ${{ steps.set-variables.outputs.archs }} + debians: ${{ steps.set-variables.outputs.debians }} + tag: ${{ steps.set-variables.outputs.image-tag }} + everything: ${{ steps.set-more-variables.outputs.everything }} + found: ${{ steps.set-more-variables.outputs.found }} steps: - uses: actions/checkout@v4 - - name: Get build-tools image tag for the current commit - id: get-build-tools-tag + - name: Set variables + id: set-variables env: + ARCHS: ${{ inputs.archs || '["x64","arm64"]' }} + DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }} IMAGE_TAG: | ${{ hashFiles('build-tools.Dockerfile', '.github/workflows/build-build-tools-image.yml') }} run: | - echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT + echo "archs=${ARCHS}" | tee -a ${GITHUB_OUTPUT} + echo "debians=${DEBIANS}" | tee -a ${GITHUB_OUTPUT} + echo "image-tag=${IMAGE_TAG}" | tee -a ${GITHUB_OUTPUT} - - name: Check if such tag found in the registry - id: check-image + - name: Set more variables + id: set-more-variables env: - IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} + IMAGE_TAG: ${{ steps.set-variables.outputs.image-tag }} + EVERYTHING: | + ${{ contains(fromJson(steps.set-variables.outputs.archs), 'x64') && + contains(fromJson(steps.set-variables.outputs.archs), 'arm64') && + contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') && + contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }} run: | if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then found=true @@ -58,8 +81,8 @@ jobs: found=false fi - echo "found=${found}" | tee -a $GITHUB_OUTPUT - + echo "everything=${EVERYTHING}" | tee -a ${GITHUB_OUTPUT} + echo "found=${found}" | tee -a ${GITHUB_OUTPUT} build-image: needs: [ check-image ] @@ -67,8 +90,8 @@ jobs: strategy: matrix: - debian-version: [ bullseye, bookworm ] - arch: [ x64, arm64 ] + arch: ${{ fromJson(needs.check-image.outputs.archs) }} + debian: ${{ fromJson(needs.check-image.outputs.debians) }} runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -99,11 +122,11 @@ jobs: push: true pull: true build-args: | - DEBIAN_VERSION=${{ matrix.debian-version }} - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} - cache-to: ${{ 
github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + DEBIAN_VERSION=${{ matrix.debian }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || '' }} tags: | - neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} + neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} merge-images: needs: [ check-image, build-image ] @@ -118,15 +141,21 @@ jobs: - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bookworm + ARCHS: ${{ join(fromJson(needs.check-image.outputs.archs), ' ') }} + DEBIANS: ${{ join(fromJson(needs.check-image.outputs.debians), ' ') }} + EVERYTHING: ${{ needs.check-image.outputs.everything }} IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | - for debian_version in bullseye bookworm; do - tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") - if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + for debian in ${DEBIANS}; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian}") + + if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") fi - docker buildx imagetools create "${tags[@]}" \ - neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + for arch in ${ARCHS}; do + tags+=("neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") + done + + docker buildx imagetools create "${tags[@]}" done diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index e1cec6d33d..d2f9d8a666 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -23,6 +23,8 @@ jobs: id: python-src with: files: | + .github/workflows/_check-codestyle-python.yml + .github/workflows/build-build-tools-image.yml .github/workflows/pre-merge-checks.yml **/**.py poetry.lock @@ -38,6 +40,10 @@ jobs: if: needs.get-changed-files.outputs.python-changed == 'true' needs: [ get-changed-files ] uses: ./.github/workflows/build-build-tools-image.yml + with: + # Build only one combination to save time + archs: '["x64"]' + debians: '["bookworm"]' secrets: inherit check-codestyle-python: @@ -45,7 +51,8 @@ jobs: needs: [ get-changed-files, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-python.yml with: - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". From 9e0148de11feefae7402bdc655ff6bf4ace8bc1f Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Nov 2024 12:12:21 +0000 Subject: [PATCH 15/29] safekeeper: use protobuf for sending compressed records to pageserver (#9821) ## Problem https://github.com/neondatabase/neon/pull/9746 lifted decoding and interpretation of WAL to the safekeeper. 
This reduced the ingested amount on the pageservers by around 10x for a tenant with 8 shards, but doubled the ingested amount for single sharded tenants. Also, https://github.com/neondatabase/neon/pull/9746 uses bincode which doesn't support schema evolution. Technically the schema can be evolved, but it's very cumbersome. ## Summary of changes This patch set addresses both problems by adding protobuf support for the interpreted wal records and adding compression support. Compressed protobuf reduced the ingested amount by 100x on the 32 shards `test_sharded_ingest` case (compared to non-interpreted proto). For the 1 shard case the reduction is 5x. Sister change to `rust-postgres` is [here](https://github.com/neondatabase/rust-postgres/pull/33). ## Links Related: https://github.com/neondatabase/neon/issues/9336 Epic: https://github.com/neondatabase/neon/issues/9329 --- Cargo.lock | 14 +- libs/pageserver_api/src/key.rs | 12 + libs/pq_proto/src/lib.rs | 4 - libs/utils/src/postgres_client.rs | 54 ++- libs/wal_decoder/Cargo.toml | 8 + libs/wal_decoder/build.rs | 11 + libs/wal_decoder/proto/interpreted_wal.proto | 43 +++ libs/wal_decoder/src/lib.rs | 1 + libs/wal_decoder/src/models.rs | 20 + libs/wal_decoder/src/wire_format.rs | 356 ++++++++++++++++++ .../walreceiver/connection_manager.rs | 4 +- .../walreceiver/walreceiver_connection.rs | 52 ++- safekeeper/src/handler.rs | 17 +- safekeeper/src/send_interpreted_wal.rs | 53 ++- safekeeper/src/send_wal.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 36 ++ .../performance/test_sharded_ingest.py | 47 ++- test_runner/regress/test_compaction.py | 16 +- test_runner/regress/test_crafted_wal_end.py | 15 +- test_runner/regress/test_subxacts.py | 15 +- .../regress/test_wal_acceptor_async.py | 21 +- 21 files changed, 702 insertions(+), 106 deletions(-) create mode 100644 libs/wal_decoder/build.rs create mode 100644 libs/wal_decoder/proto/interpreted_wal.proto create mode 100644 libs/wal_decoder/src/wire_format.rs diff --git a/Cargo.lock b/Cargo.lock index c1a14210de..43a46fb1eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4133,7 +4133,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" dependencies = [ "bytes", "fallible-iterator", @@ -4146,7 +4146,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" dependencies = [ "base64 0.20.0", "byteorder", @@ -4165,7 +4165,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" dependencies = [ "bytes", "fallible-iterator", @@ -6468,7 +6468,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#2a2a7c56930dd5ad60676ce6da92e1cbe6fb3ef5" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" 
dependencies = [ "async-trait", "byteorder", @@ -7120,10 +7120,16 @@ name = "wal_decoder" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "bytes", "pageserver_api", "postgres_ffi", + "prost", "serde", + "thiserror", + "tokio", + "tonic", + "tonic-build", "tracing", "utils", "workspace_hack", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 4505101ea6..523d143381 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -229,6 +229,18 @@ impl Key { } } +impl CompactKey { + pub fn raw(&self) -> i128 { + self.0 + } +} + +impl From for CompactKey { + fn from(value: i128) -> Self { + Self(value) + } +} + impl fmt::Display for Key { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index b7871ab01f..4b0331999d 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -688,9 +688,6 @@ pub struct InterpretedWalRecordsBody<'a> { pub streaming_lsn: u64, /// Current end of WAL on the server pub commit_lsn: u64, - /// Start LSN of the next record in PG WAL. - /// Is 0 if the portion of PG WAL did not contain any records. - pub next_record_lsn: u64, pub data: &'a [u8], } @@ -1028,7 +1025,6 @@ impl BeMessage<'_> { // dependency buf.put_u64(rec.streaming_lsn); buf.put_u64(rec.commit_lsn); - buf.put_u64(rec.next_record_lsn); buf.put_slice(rec.data); }); } diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index 3073bbde4c..a62568202b 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -7,40 +7,31 @@ use postgres_connection::{parse_host_port, PgConnectionConfig}; use crate::id::TenantTimelineId; -/// Postgres client protocol types -#[derive( - Copy, - Clone, - PartialEq, - Eq, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, - Debug, -)] -#[strum(serialize_all = "kebab-case")] -#[repr(u8)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum InterpretedFormat { + Bincode, + Protobuf, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum Compression { + Zstd { level: i8 }, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "type", content = "args")] +#[serde(rename_all = "kebab-case")] pub enum PostgresClientProtocol { /// Usual Postgres replication protocol Vanilla, /// Custom shard-aware protocol that replicates interpreted records. /// Used to send wal from safekeeper to pageserver. 
- Interpreted, -} - -impl TryFrom for PostgresClientProtocol { - type Error = u8; - - fn try_from(value: u8) -> Result { - Ok(match value { - v if v == (PostgresClientProtocol::Vanilla as u8) => PostgresClientProtocol::Vanilla, - v if v == (PostgresClientProtocol::Interpreted as u8) => { - PostgresClientProtocol::Interpreted - } - x => return Err(x), - }) - } + Interpreted { + format: InterpretedFormat, + compression: Option, + }, } pub struct ConnectionConfigArgs<'a> { @@ -63,7 +54,10 @@ impl<'a> ConnectionConfigArgs<'a> { "-c".to_owned(), format!("timeline_id={}", self.ttid.timeline_id), format!("tenant_id={}", self.ttid.tenant_id), - format!("protocol={}", self.protocol as u8), + format!( + "protocol={}", + serde_json::to_string(&self.protocol).unwrap() + ), ]; if self.shard_number.is_some() { diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index c8c0f4c990..8fac4e38ca 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -8,11 +8,19 @@ license.workspace = true testing = ["pageserver_api/testing"] [dependencies] +async-compression.workspace = true anyhow.workspace = true bytes.workspace = true pageserver_api.workspace = true +prost.workspace = true postgres_ffi.workspace = true serde.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["io-util"] } +tonic.workspace = true tracing.workspace = true utils.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[build-dependencies] +tonic-build.workspace = true diff --git a/libs/wal_decoder/build.rs b/libs/wal_decoder/build.rs new file mode 100644 index 0000000000..d5b7ad02ad --- /dev/null +++ b/libs/wal_decoder/build.rs @@ -0,0 +1,11 @@ +fn main() -> Result<(), Box> { + // Generate rust code from .proto protobuf. + // + // Note: we previously tried to use deterministic location at proto/ for + // easy location, but apparently interference with cachepot sometimes fails + // the build then. Anyway, per cargo docs build script shouldn't output to + // anywhere but $OUT_DIR. 
+ tonic_build::compile_protos("proto/interpreted_wal.proto") + .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + Ok(()) +} diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto new file mode 100644 index 0000000000..0393392c1a --- /dev/null +++ b/libs/wal_decoder/proto/interpreted_wal.proto @@ -0,0 +1,43 @@ +syntax = "proto3"; + +package interpreted_wal; + +message InterpretedWalRecords { + repeated InterpretedWalRecord records = 1; + optional uint64 next_record_lsn = 2; +} + +message InterpretedWalRecord { + optional bytes metadata_record = 1; + SerializedValueBatch batch = 2; + uint64 next_record_lsn = 3; + bool flush_uncommitted = 4; + uint32 xid = 5; +} + +message SerializedValueBatch { + bytes raw = 1; + repeated ValueMeta metadata = 2; + uint64 max_lsn = 3; + uint64 len = 4; +} + +enum ValueMetaType { + Serialized = 0; + Observed = 1; +} + +message ValueMeta { + ValueMetaType type = 1; + CompactKey key = 2; + uint64 lsn = 3; + optional uint64 batch_offset = 4; + optional uint64 len = 5; + optional bool will_init = 6; +} + +message CompactKey { + int64 high = 1; + int64 low = 2; +} + diff --git a/libs/wal_decoder/src/lib.rs b/libs/wal_decoder/src/lib.rs index a8a26956e6..96b717021f 100644 --- a/libs/wal_decoder/src/lib.rs +++ b/libs/wal_decoder/src/lib.rs @@ -1,3 +1,4 @@ pub mod decoder; pub mod models; pub mod serialized_batch; +pub mod wire_format; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 7ac425cb5f..af22de5d95 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -37,12 +37,32 @@ use utils::lsn::Lsn; use crate::serialized_batch::SerializedValueBatch; +// Code generated by protobuf. +pub mod proto { + // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]` + // we don't use these types for anything but broker data transmission, + // so it's ok to ignore this one. + #![allow(clippy::derive_partial_eq_without_eq)] + // The generated ValueMeta has a `len` method generate for its `len` field. + #![allow(clippy::len_without_is_empty)] + tonic::include_proto!("interpreted_wal"); +} + #[derive(Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, } +/// A batch of interpreted WAL records +#[derive(Serialize, Deserialize)] +pub struct InterpretedWalRecords { + pub records: Vec, + // Start LSN of the next record after the batch. + // Note that said record may not belong to the current shard. 
+ pub next_record_lsn: Option, +} + /// An interpreted Postgres WAL record, ready to be handled by the pageserver #[derive(Serialize, Deserialize)] pub struct InterpretedWalRecord { diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs new file mode 100644 index 0000000000..5a343054c3 --- /dev/null +++ b/libs/wal_decoder/src/wire_format.rs @@ -0,0 +1,356 @@ +use bytes::{BufMut, Bytes, BytesMut}; +use pageserver_api::key::CompactKey; +use prost::{DecodeError, EncodeError, Message}; +use tokio::io::AsyncWriteExt; +use utils::bin_ser::{BeSer, DeserializeError, SerializeError}; +use utils::lsn::Lsn; +use utils::postgres_client::{Compression, InterpretedFormat}; + +use crate::models::{ + FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords, MetadataRecord, +}; + +use crate::serialized_batch::{ + ObservedValueMeta, SerializedValueBatch, SerializedValueMeta, ValueMeta, +}; + +use crate::models::proto; + +#[derive(Debug, thiserror::Error)] +pub enum ToWireFormatError { + #[error("{0}")] + Bincode(#[from] SerializeError), + #[error("{0}")] + Protobuf(#[from] ProtobufSerializeError), + #[error("{0}")] + Compression(#[from] std::io::Error), +} + +#[derive(Debug, thiserror::Error)] +pub enum ProtobufSerializeError { + #[error("{0}")] + MetadataRecord(#[from] SerializeError), + #[error("{0}")] + Encode(#[from] EncodeError), +} + +#[derive(Debug, thiserror::Error)] +pub enum FromWireFormatError { + #[error("{0}")] + Bincode(#[from] DeserializeError), + #[error("{0}")] + Protobuf(#[from] ProtobufDeserializeError), + #[error("{0}")] + Decompress(#[from] std::io::Error), +} + +#[derive(Debug, thiserror::Error)] +pub enum ProtobufDeserializeError { + #[error("{0}")] + Transcode(#[from] TranscodeError), + #[error("{0}")] + Decode(#[from] DecodeError), +} + +#[derive(Debug, thiserror::Error)] +pub enum TranscodeError { + #[error("{0}")] + BadInput(String), + #[error("{0}")] + MetadataRecord(#[from] DeserializeError), +} + +pub trait ToWireFormat { + fn to_wire( + self, + format: InterpretedFormat, + compression: Option, + ) -> impl std::future::Future> + Send; +} + +pub trait FromWireFormat { + type T; + fn from_wire( + buf: &Bytes, + format: InterpretedFormat, + compression: Option, + ) -> impl std::future::Future> + Send; +} + +impl ToWireFormat for InterpretedWalRecords { + async fn to_wire( + self, + format: InterpretedFormat, + compression: Option, + ) -> Result { + use async_compression::tokio::write::ZstdEncoder; + use async_compression::Level; + + let encode_res: Result = match format { + InterpretedFormat::Bincode => { + let buf = BytesMut::new(); + let mut buf = buf.writer(); + self.ser_into(&mut buf)?; + Ok(buf.into_inner().freeze()) + } + InterpretedFormat::Protobuf => { + let proto: proto::InterpretedWalRecords = self.try_into()?; + let mut buf = BytesMut::new(); + proto + .encode(&mut buf) + .map_err(|e| ToWireFormatError::Protobuf(e.into()))?; + + Ok(buf.freeze()) + } + }; + + let buf = encode_res?; + let compressed_buf = match compression { + Some(Compression::Zstd { level }) => { + let mut encoder = ZstdEncoder::with_quality( + Vec::with_capacity(buf.len() / 4), + Level::Precise(level as i32), + ); + encoder.write_all(&buf).await?; + encoder.shutdown().await?; + Bytes::from(encoder.into_inner()) + } + None => buf, + }; + + Ok(compressed_buf) + } +} + +impl FromWireFormat for InterpretedWalRecords { + type T = Self; + + async fn from_wire( + buf: &Bytes, + format: InterpretedFormat, + compression: Option, + ) -> Result { + let decompressed_buf = 
match compression { + Some(Compression::Zstd { .. }) => { + use async_compression::tokio::write::ZstdDecoder; + let mut decoded_buf = Vec::with_capacity(buf.len()); + let mut decoder = ZstdDecoder::new(&mut decoded_buf); + decoder.write_all(buf).await?; + decoder.flush().await?; + Bytes::from(decoded_buf) + } + None => buf.clone(), + }; + + match format { + InterpretedFormat::Bincode => { + InterpretedWalRecords::des(&decompressed_buf).map_err(FromWireFormatError::Bincode) + } + InterpretedFormat::Protobuf => { + let proto = proto::InterpretedWalRecords::decode(decompressed_buf) + .map_err(|e| FromWireFormatError::Protobuf(e.into()))?; + InterpretedWalRecords::try_from(proto) + .map_err(|e| FromWireFormatError::Protobuf(e.into())) + } + } + } +} + +impl TryFrom for proto::InterpretedWalRecords { + type Error = SerializeError; + + fn try_from(value: InterpretedWalRecords) -> Result { + let records = value + .records + .into_iter() + .map(proto::InterpretedWalRecord::try_from) + .collect::, _>>()?; + Ok(proto::InterpretedWalRecords { + records, + next_record_lsn: value.next_record_lsn.map(|l| l.0), + }) + } +} + +impl TryFrom for proto::InterpretedWalRecord { + type Error = SerializeError; + + fn try_from(value: InterpretedWalRecord) -> Result { + let metadata_record = value + .metadata_record + .map(|meta_rec| -> Result, Self::Error> { + let mut buf = Vec::new(); + meta_rec.ser_into(&mut buf)?; + Ok(buf) + }) + .transpose()?; + + Ok(proto::InterpretedWalRecord { + metadata_record, + batch: Some(proto::SerializedValueBatch::from(value.batch)), + next_record_lsn: value.next_record_lsn.0, + flush_uncommitted: matches!(value.flush_uncommitted, FlushUncommittedRecords::Yes), + xid: value.xid, + }) + } +} + +impl From for proto::SerializedValueBatch { + fn from(value: SerializedValueBatch) -> Self { + proto::SerializedValueBatch { + raw: value.raw, + metadata: value + .metadata + .into_iter() + .map(proto::ValueMeta::from) + .collect(), + max_lsn: value.max_lsn.0, + len: value.len as u64, + } + } +} + +impl From for proto::ValueMeta { + fn from(value: ValueMeta) -> Self { + match value { + ValueMeta::Observed(obs) => proto::ValueMeta { + r#type: proto::ValueMetaType::Observed.into(), + key: Some(proto::CompactKey::from(obs.key)), + lsn: obs.lsn.0, + batch_offset: None, + len: None, + will_init: None, + }, + ValueMeta::Serialized(ser) => proto::ValueMeta { + r#type: proto::ValueMetaType::Serialized.into(), + key: Some(proto::CompactKey::from(ser.key)), + lsn: ser.lsn.0, + batch_offset: Some(ser.batch_offset), + len: Some(ser.len as u64), + will_init: Some(ser.will_init), + }, + } + } +} + +impl From for proto::CompactKey { + fn from(value: CompactKey) -> Self { + proto::CompactKey { + high: (value.raw() >> 64) as i64, + low: value.raw() as i64, + } + } +} + +impl TryFrom for InterpretedWalRecords { + type Error = TranscodeError; + + fn try_from(value: proto::InterpretedWalRecords) -> Result { + let records = value + .records + .into_iter() + .map(InterpretedWalRecord::try_from) + .collect::>()?; + + Ok(InterpretedWalRecords { + records, + next_record_lsn: value.next_record_lsn.map(Lsn::from), + }) + } +} + +impl TryFrom for InterpretedWalRecord { + type Error = TranscodeError; + + fn try_from(value: proto::InterpretedWalRecord) -> Result { + let metadata_record = value + .metadata_record + .map(|mrec| -> Result<_, DeserializeError> { MetadataRecord::des(&mrec) }) + .transpose()?; + + let batch = { + let batch = value.batch.ok_or_else(|| { + TranscodeError::BadInput("InterpretedWalRecord::batch 
missing".to_string()) + })?; + + SerializedValueBatch::try_from(batch)? + }; + + Ok(InterpretedWalRecord { + metadata_record, + batch, + next_record_lsn: Lsn(value.next_record_lsn), + flush_uncommitted: if value.flush_uncommitted { + FlushUncommittedRecords::Yes + } else { + FlushUncommittedRecords::No + }, + xid: value.xid, + }) + } +} + +impl TryFrom for SerializedValueBatch { + type Error = TranscodeError; + + fn try_from(value: proto::SerializedValueBatch) -> Result { + let metadata = value + .metadata + .into_iter() + .map(ValueMeta::try_from) + .collect::, _>>()?; + + Ok(SerializedValueBatch { + raw: value.raw, + metadata, + max_lsn: Lsn(value.max_lsn), + len: value.len as usize, + }) + } +} + +impl TryFrom for ValueMeta { + type Error = TranscodeError; + + fn try_from(value: proto::ValueMeta) -> Result { + match proto::ValueMetaType::try_from(value.r#type) { + Ok(proto::ValueMetaType::Serialized) => { + Ok(ValueMeta::Serialized(SerializedValueMeta { + key: value + .key + .ok_or_else(|| { + TranscodeError::BadInput("ValueMeta::key missing".to_string()) + })? + .into(), + lsn: Lsn(value.lsn), + batch_offset: value.batch_offset.ok_or_else(|| { + TranscodeError::BadInput("ValueMeta::batch_offset missing".to_string()) + })?, + len: value.len.ok_or_else(|| { + TranscodeError::BadInput("ValueMeta::len missing".to_string()) + })? as usize, + will_init: value.will_init.ok_or_else(|| { + TranscodeError::BadInput("ValueMeta::will_init missing".to_string()) + })?, + })) + } + Ok(proto::ValueMetaType::Observed) => Ok(ValueMeta::Observed(ObservedValueMeta { + key: value + .key + .ok_or_else(|| TranscodeError::BadInput("ValueMeta::key missing".to_string()))? + .into(), + lsn: Lsn(value.lsn), + })), + Err(_) => Err(TranscodeError::BadInput(format!( + "Unexpected ValueMeta::type {}", + value.r#type + ))), + } + } +} + +impl From for CompactKey { + fn from(value: proto::CompactKey) -> Self { + (((value.high as i128) << 64) | (value.low as i128)).into() + } +} diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7a64703a30..583d6309ab 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -535,6 +535,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let ingest_batch_size = self.conf.ingest_batch_size; + let protocol = self.conf.protocol; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -548,6 +549,7 @@ impl ConnectionManagerState { let res = super::walreceiver_connection::handle_walreceiver_connection( timeline, + protocol, new_sk.wal_source_connconf, events_sender, cancellation.clone(), @@ -991,7 +993,7 @@ impl ConnectionManagerState { PostgresClientProtocol::Vanilla => { (None, None, None) }, - PostgresClientProtocol::Interpreted => { + PostgresClientProtocol::Interpreted { .. 
} => { let shard_identity = self.timeline.get_shard_identity(); ( Some(shard_identity.number.0), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1a0e66ceb3..31cf1b6307 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,10 @@ use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn, Instrument}; -use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord}; +use wal_decoder::{ + models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords}, + wire_format::FromWireFormat, +}; use super::TaskStateUpdate; use crate::{ @@ -36,7 +39,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{bin_ser::BeSer, id::NodeId, lsn::Lsn}; +use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -109,6 +112,7 @@ impl From for WalReceiverError { #[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, + protocol: PostgresClientProtocol, wal_source_connconf: PgConnectionConfig, events_sender: watch::Sender>, cancellation: CancellationToken, @@ -260,6 +264,14 @@ pub(super) async fn handle_walreceiver_connection( let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let interpreted_proto_config = match protocol { + PostgresClientProtocol::Vanilla => None, + PostgresClientProtocol::Interpreted { + format, + compression, + } => Some((format, compression)), + }; + while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { @@ -332,16 +344,26 @@ pub(super) async fn handle_walreceiver_connection( // This is the end LSN of the raw WAL from which the records // were interpreted. let streaming_lsn = Lsn::from(raw.streaming_lsn()); - tracing::debug!( - "Received WAL up to {streaming_lsn} with next_record_lsn={}", - Lsn(raw.next_record_lsn().unwrap_or(0)) - ); - let records = Vec::::des(raw.data()).with_context(|| { - anyhow::anyhow!( + let (format, compression) = interpreted_proto_config.unwrap(); + let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression) + .await + .with_context(|| { + anyhow::anyhow!( "Failed to deserialize interpreted records ending at LSN {streaming_lsn}" ) - })?; + })?; + + let InterpretedWalRecords { + records, + next_record_lsn, + } = batch; + + tracing::debug!( + "Received WAL up to {} with next_record_lsn={:?}", + streaming_lsn, + next_record_lsn + ); // We start the modification at 0 because each interpreted record // advances it to its end LSN. 0 is just an initialization placeholder. 
@@ -360,14 +382,18 @@ pub(super) async fn handle_walreceiver_connection( .await?; } - let next_record_lsn = interpreted.next_record_lsn; + let local_next_record_lsn = interpreted.next_record_lsn; let ingested = walingest .ingest_record(interpreted, &mut modification, &ctx) .await - .with_context(|| format!("could not ingest record at {next_record_lsn}"))?; + .with_context(|| { + format!("could not ingest record at {local_next_record_lsn}") + })?; if !ingested { - tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); + tracing::debug!( + "ingest: filtered out record @ LSN {local_next_record_lsn}" + ); WAL_INGEST.records_filtered.inc(); filtered_records += 1; } @@ -399,7 +425,7 @@ pub(super) async fn handle_walreceiver_connection( // need to advance last record LSN on all shards. If we've not ingested the latest // record, then set the LSN of the modification past it. This way all shards // advance their last record LSN at the same time. - let needs_last_record_lsn_advance = match raw.next_record_lsn().map(Lsn::from) { + let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) { Some(lsn) if lsn > modification.get_lsn() => { modification.set_lsn(lsn).unwrap(); true diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index cec7c3c7ee..22f33b17e0 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -123,17 +123,10 @@ impl postgres_backend::Handler // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 match opt.split_once('=') { Some(("protocol", value)) => { - let raw_value = value - .parse::() - .with_context(|| format!("Failed to parse {value} as protocol"))?; - - self.protocol = Some( - PostgresClientProtocol::try_from(raw_value).map_err(|_| { - QueryError::Other(anyhow::anyhow!( - "Unexpected client protocol type: {raw_value}" - )) - })?, - ); + self.protocol = + Some(serde_json::from_str(value).with_context(|| { + format!("Failed to parse {value} as protocol") + })?); } Some(("ztenantid", value)) | Some(("tenant_id", value)) => { self.tenant_id = Some(value.parse().with_context(|| { @@ -180,7 +173,7 @@ impl postgres_backend::Handler ))); } } - PostgresClientProtocol::Interpreted => { + PostgresClientProtocol::Interpreted { .. } => { match (shard_count, shard_number, shard_stripe_size) { (Some(count), Some(number), Some(stripe_size)) => { let params = ShardParameters { diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index cf0ee276e9..2589030422 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -9,9 +9,11 @@ use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::time::MissedTickBehavior; -use utils::bin_ser::BeSer; use utils::lsn::Lsn; -use wal_decoder::models::InterpretedWalRecord; +use utils::postgres_client::Compression; +use utils::postgres_client::InterpretedFormat; +use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; +use wal_decoder::wire_format::ToWireFormat; use crate::send_wal::EndWatchView; use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; @@ -20,6 +22,8 @@ use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; /// This is used for sending WAL to the pageserver. Said WAL /// is pre-interpreted and filtered for the shard. 
pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) format: InterpretedFormat, + pub(crate) compression: Option, pub(crate) pgb: &'a mut PostgresBackend, pub(crate) wal_stream_builder: WalReaderStreamBuilder, pub(crate) end_watch_view: EndWatchView, @@ -28,6 +32,12 @@ pub(crate) struct InterpretedWalSender<'a, IO> { pub(crate) appname: Option, } +struct Batch { + wal_end_lsn: Lsn, + available_wal_end_lsn: Lsn, + records: InterpretedWalRecords, +} + impl InterpretedWalSender<'_, IO> { /// Send interpreted WAL to a receiver. /// Stops when an error occurs or the receiver is caught up and there's no active compute. @@ -46,10 +56,13 @@ impl InterpretedWalSender<'_, IO> { keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); keepalive_ticker.reset(); + let (tx, mut rx) = tokio::sync::mpsc::channel::(2); + loop { tokio::select! { - // Get some WAL from the stream and then: decode, interpret and send it - wal = stream.next() => { + // Get some WAL from the stream and then: decode, interpret and push it down the + // pipeline. + wal = stream.next(), if tx.capacity() > 0 => { let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { Some(some) => some?, None => { break; } @@ -81,10 +94,26 @@ impl InterpretedWalSender<'_, IO> { } } - let mut buf = Vec::new(); - records - .ser_into(&mut buf) - .with_context(|| "Failed to serialize interpreted WAL")?; + let batch = InterpretedWalRecords { + records, + next_record_lsn: max_next_record_lsn + }; + + tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); + }, + // For a previously interpreted batch, serialize it and push it down the wire. + batch = rx.recv() => { + let batch = match batch { + Some(b) => b, + None => { break; } + }; + + let buf = batch + .records + .to_wire(self.format, self.compression) + .await + .with_context(|| "Failed to serialize interpreted WAL") + .map_err(CopyStreamHandlerEnd::from)?; // Reset the keep alive ticker since we are sending something // over the wire now. @@ -92,13 +121,11 @@ impl InterpretedWalSender<'_, IO> { self.pgb .write_message(&BeMessage::InterpretedWalRecords(InterpretedWalRecordsBody { - streaming_lsn: wal_end_lsn.0, - commit_lsn: available_wal_end_lsn.0, - next_record_lsn: max_next_record_lsn.unwrap_or(Lsn::INVALID).0, - data: buf.as_slice(), + streaming_lsn: batch.wal_end_lsn.0, + commit_lsn: batch.available_wal_end_lsn.0, + data: &buf, })).await?; } - // Send a periodic keep alive when the connection has been idle for a while. 
_ = keepalive_ticker.tick() => { self.pgb diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 1acfcad418..225b7f4c05 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -454,7 +454,7 @@ impl SafekeeperPostgresHandler { } info!( - "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={}", + "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}, protocol={:?}", start_pos, end_pos, matches!(end_watch, EndWatch::Flush(_)), @@ -489,7 +489,10 @@ impl SafekeeperPostgresHandler { Either::Left(sender.run()) } - PostgresClientProtocol::Interpreted => { + PostgresClientProtocol::Interpreted { + format, + compression, + } => { let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; let end_watch_view = end_watch.view(); let wal_stream_builder = WalReaderStreamBuilder { @@ -502,6 +505,8 @@ impl SafekeeperPostgresHandler { }; let sender = InterpretedWalSender { + format, + compression, pgb, wal_stream_builder, end_watch_view, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 07d442b4a6..a45a311dc2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -310,6 +310,31 @@ class PgProtocol: return self.safe_psql(query, log_query=log_query)[0][0] +class PageserverWalReceiverProtocol(StrEnum): + VANILLA = "vanilla" + INTERPRETED = "interpreted" + + @staticmethod + def to_config_key_value(proto) -> tuple[str, dict[str, Any]]: + if proto == PageserverWalReceiverProtocol.VANILLA: + return ( + "wal_receiver_protocol", + { + "type": "vanilla", + }, + ) + elif proto == PageserverWalReceiverProtocol.INTERPRETED: + return ( + "wal_receiver_protocol", + { + "type": "interpreted", + "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, + }, + ) + else: + raise ValueError(f"Unknown protocol type: {proto}") + + class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -356,6 +381,7 @@ class NeonEnvBuilder: safekeeper_extra_opts: list[str] | None = None, storage_controller_port_override: int | None = None, pageserver_virtual_file_io_mode: str | None = None, + pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -409,6 +435,8 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode + self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1023,6 +1051,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode + self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1092,6 +1121,13 @@ class NeonEnv: if self.pageserver_virtual_file_io_mode is not None: ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode + if self.pageserver_wal_receiver_protocol is not None: + key, value = PageserverWalReceiverProtocol.to_config_key_value( + self.pageserver_wal_receiver_protocol + ) + if key not in ps_cfg: + ps_cfg[key] = value + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) diff --git 
a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index e965aae5a0..4c21e799c8 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -15,7 +15,14 @@ from fixtures.neon_fixtures import ( @pytest.mark.timeout(600) @pytest.mark.parametrize("shard_count", [1, 8, 32]) -@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +@pytest.mark.parametrize( + "wal_receiver_protocol", + [ + "vanilla", + "interpreted-bincode-compressed", + "interpreted-protobuf-compressed", + ], +) def test_sharded_ingest( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, @@ -27,14 +34,42 @@ def test_sharded_ingest( and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case (shard_count=1) to the sharded case indicates the overhead of sharding. """ - neon_env_builder.pageserver_config_override = ( - f"wal_receiver_protocol = '{wal_receiver_protocol}'" - ) - ROW_COUNT = 100_000_000 # about 7 GB of WAL neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + + for ps in env.pageservers: + if wal_receiver_protocol == "vanilla": + ps.patch_config_toml_nonrecursive( + { + "wal_receiver_protocol": { + "type": "vanilla", + } + } + ) + elif wal_receiver_protocol == "interpreted-bincode-compressed": + ps.patch_config_toml_nonrecursive( + { + "wal_receiver_protocol": { + "type": "interpreted", + "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, + } + } + ) + elif wal_receiver_protocol == "interpreted-protobuf-compressed": + ps.patch_config_toml_nonrecursive( + { + "wal_receiver_protocol": { + "type": "interpreted", + "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}}, + } + } + ) + else: + raise AssertionError("Test must use explicit wal receiver protocol config") + + env.start() # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure # the storage controller doesn't mess with shard placements. diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 79fd256304..302a8fd0d1 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -8,6 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PageserverWalReceiverProtocol, generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException @@ -27,8 +28,13 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { @skip_in_debug_build("only run with release build") -@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) -def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): +@pytest.mark.parametrize( + "wal_receiver_protocol", + [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], +) +def test_pageserver_compaction_smoke( + neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol +): """ This is a smoke test that compaction kicks in. The workload repeatedly churns a small number of rows and manually instructs the pageserver to run compaction @@ -37,10 +43,12 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder, wal_recei observed bounds. 
""" + neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol + # Effectively disable the page cache to rely only on image layers # to shorten reads. - neon_env_builder.pageserver_config_override = f""" -page_cache_size=10; wal_receiver_protocol='{wal_receiver_protocol}' + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 """ env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 70e71d99cd..6b9dcbba07 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -3,7 +3,7 @@ from __future__ import annotations import pytest from fixtures.log_helper import log from fixtures.neon_cli import WalCraft -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. @@ -19,13 +19,16 @@ from fixtures.neon_fixtures import NeonEnvBuilder "wal_record_crossing_segment_followed_by_small_one", ], ) -@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +@pytest.mark.parametrize( + "wal_receiver_protocol", + [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], +) def test_crafted_wal_end( - neon_env_builder: NeonEnvBuilder, wal_type: str, wal_receiver_protocol: str + neon_env_builder: NeonEnvBuilder, + wal_type: str, + wal_receiver_protocol: PageserverWalReceiverProtocol, ): - neon_env_builder.pageserver_config_override = ( - f"wal_receiver_protocol = '{wal_receiver_protocol}'" - ) + neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol env = neon_env_builder.init_start() env.create_branch("test_crafted_wal_end") diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 1d86c353be..b235da0bc7 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,7 +1,11 @@ from __future__ import annotations import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, check_restored_datadir_content +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PageserverWalReceiverProtocol, + check_restored_datadir_content, +) # Test subtransactions @@ -10,11 +14,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, check_restored_datadir_conten # maintained in the pageserver, so subtransactions are not very exciting for # Neon. They are included in the commit record though and updated in the # CLOG. 
-@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) +@pytest.mark.parametrize( + "wal_receiver_protocol", + [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], +) def test_subxacts(neon_env_builder: NeonEnvBuilder, test_output_dir, wal_receiver_protocol): - neon_env_builder.pageserver_config_override = ( - f"wal_receiver_protocol = '{wal_receiver_protocol}'" - ) + neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 094b10b576..b32b028fa1 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -11,7 +11,13 @@ import pytest import toml from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger -from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PageserverWalReceiverProtocol, + Safekeeper, +) from fixtures.remote_storage import RemoteStorageKind from fixtures.utils import skip_in_debug_build @@ -622,12 +628,15 @@ async def run_segment_init_failure(env: NeonEnv): # Test (injected) failure during WAL segment init. # https://github.com/neondatabase/neon/issues/6401 # https://github.com/neondatabase/neon/issues/6402 -@pytest.mark.parametrize("wal_receiver_protocol", ["vanilla", "interpreted"]) -def test_segment_init_failure(neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: str): +@pytest.mark.parametrize( + "wal_receiver_protocol", + [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED], +) +def test_segment_init_failure( + neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol +): neon_env_builder.num_safekeepers = 1 - neon_env_builder.pageserver_config_override = ( - f"wal_receiver_protocol = '{wal_receiver_protocol}'" - ) + neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol env = neon_env_builder.init_start() asyncio.run(run_segment_init_failure(env)) From 8fdf786217170192d383211f6e3fe0283ce5036d Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 27 Nov 2024 13:46:23 +0000 Subject: [PATCH 16/29] pageserver: add tenant config override for wal receiver proto (#9888) ## Problem Can't change protocol at tenant granularity. ## Summary of changes Add tenant config level override for wal receiver protocol. 
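For illustration, the per-tenant override accepts the same JSON value as the pageserver-wide `wal_receiver_protocol` setting from the previous patch. A minimal sketch of the shape, mirroring the case added to `test_fully_custom_config` below (the surrounding fixture call is omitted; only the key and value come from this change):

```python
# Shape of the new per-tenant override, as exercised by test_fully_custom_config
# in this patch. This is just the config value, not a complete API call.
tenant_conf = {
    "wal_receiver_protocol_override": {
        "type": "interpreted",
        "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
    },
}
```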
## Links Related: https://github.com/neondatabase/neon/issues/9336 Epic: https://github.com/neondatabase/neon/issues/9329 --- control_plane/src/pageserver.rs | 5 +++++ libs/pageserver_api/src/config.rs | 3 +++ libs/pageserver_api/src/models.rs | 2 ++ pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 8 ++++++++ pageserver/src/tenant/timeline.rs | 18 +++++++++++++++++- .../regress/test_attach_tenant_config.py | 4 ++++ 7 files changed, 40 insertions(+), 1 deletion(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index ae5e22ddc6..1d1455b95b 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -415,6 +415,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'timeline_offloading' as bool")?, + wal_receiver_protocol_override: settings + .remove("wal_receiver_protocol_override") + .map(serde_json::from_str) + .transpose() + .context("parse `wal_receiver_protocol_override` from json")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0abca5cdc2..721d97404b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -278,6 +278,8 @@ pub struct TenantConfigToml { /// Enable auto-offloading of timelines. /// (either this flag or the pageserver-global one need to be set) pub timeline_offloading: bool, + + pub wal_receiver_protocol_override: Option, } pub mod defaults { @@ -510,6 +512,7 @@ impl Default for TenantConfigToml { lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, + wal_receiver_protocol_override: None, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 1b86bfd91a..42c5d10c05 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -23,6 +23,7 @@ use utils::{ completion, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + postgres_client::PostgresClientProtocol, serde_system_time, }; @@ -352,6 +353,7 @@ pub struct TenantConfig { pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, + pub wal_receiver_protocol_override: Option, } /// The policy for the aux file storage. 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0214ee68fa..bddcb534a1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5344,6 +5344,7 @@ pub(crate) mod harness { lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), + wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override, } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 4d6176bfd9..5d3ac5a8e3 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -19,6 +19,7 @@ use serde_json::Value; use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; +use utils::postgres_client::PostgresClientProtocol; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { @@ -353,6 +354,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub timeline_offloading: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub wal_receiver_protocol_override: Option, } impl TenantConfOpt { @@ -418,6 +422,9 @@ impl TenantConfOpt { timeline_offloading: self .lazy_slru_download .unwrap_or(global_conf.timeline_offloading), + wal_receiver_protocol_override: self + .wal_receiver_protocol_override + .or(global_conf.wal_receiver_protocol_override), } } } @@ -472,6 +479,7 @@ impl From for models::TenantConfig { lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, + wal_receiver_protocol_override: value.wal_receiver_protocol_override, } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c1ff0f426d..afd4664d01 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -50,6 +50,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ fs_ext, pausable_failpoint, + postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; use wal_decoder::serialized_batch::SerializedValueBatch; @@ -2178,6 +2179,21 @@ impl Timeline { ) } + /// Resolve the effective WAL receiver protocol to use for this tenant. + /// + /// Priority order is: + /// 1. Tenant config override + /// 2. Default value for tenant config override + /// 3. Pageserver config override + /// 4. Pageserver config default + pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .wal_receiver_protocol_override + .or(self.conf.default_tenant_conf.wal_receiver_protocol_override) + .unwrap_or(self.conf.wal_receiver_protocol) + } + pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. 
@@ -2470,7 +2486,7 @@ impl Timeline { *guard = Some(WalReceiver::start( Arc::clone(self), WalReceiverConf { - protocol: self.conf.wal_receiver_protocol, + protocol: self.resolve_wal_receiver_protocol(), wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 5744c445f6..670c2698f5 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -174,6 +174,10 @@ def test_fully_custom_config(positive_env: NeonEnv): "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", "timeline_offloading": True, + "wal_receiver_protocol_override": { + "type": "interpreted", + "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}}, + }, } vps_http = env.storage_controller.pageserver_api() From e4f437a354cc42bcbb081f72dffa8987932459f3 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 27 Nov 2024 14:54:14 +0100 Subject: [PATCH 17/29] pageserver: add relsize cache metrics (#9890) ## Problem We don't have any observability for the relation size cache. We have seen cache misses cause significant performance impact with high relation counts. Touches #9855. ## Summary of changes Adds the following metrics: * `pageserver_relsize_cache_entries` * `pageserver_relsize_cache_hits` * `pageserver_relsize_cache_misses` * `pageserver_relsize_cache_misses_old` --- pageserver/src/metrics.rs | 29 +++++++++++++++++++++++++++++ pageserver/src/pgdatadir_mapping.rs | 15 +++++++++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5ce3ae6cf7..78a157f51e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -662,6 +662,35 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy = Lazy::new(| .expect("failed to define a metric") }); +pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_relsize_cache_entries", + "Number of entries in the relation size cache", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_CACHE_HITS: Lazy = Lazy::new(|| { + register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_CACHE_MISSES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_cache_misses", + "Relation size cache misses", + ) + .expect("failed to define a metric") +}); + +pub(crate) static RELSIZE_CACHE_MISSES_OLD: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_relsize_cache_misses_old", + "Relation size cache misses where the lookup LSN is older than the last relation update" + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c491bfe650..4f42427276 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -10,6 +10,9 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::metrics::{ + RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, +}; use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id, 
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, @@ -1129,9 +1132,12 @@ impl Timeline { let rel_size_cache = self.rel_size_cache.read().unwrap(); if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { + RELSIZE_CACHE_HITS.inc(); return Some(*nblocks); } + RELSIZE_CACHE_MISSES_OLD.inc(); } + RELSIZE_CACHE_MISSES.inc(); None } @@ -1156,6 +1162,7 @@ impl Timeline { } hash_map::Entry::Vacant(entry) => { entry.insert((lsn, nblocks)); + RELSIZE_CACHE_ENTRIES.inc(); } } } @@ -1163,13 +1170,17 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.map.insert(tag, (lsn, nblocks)); + if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() { + RELSIZE_CACHE_ENTRIES.inc(); + } } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.map.remove(tag); + if rel_size_cache.map.remove(tag).is_some() { + RELSIZE_CACHE_ENTRIES.dec(); + } } } From 23f5a2714631e6e211a5afac23c21076399e3c8a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:07:39 -0500 Subject: [PATCH 18/29] fix(storage-scrubber): valid layermap error degrades to warning (#9902) Valid layer assumption is a necessary condition for a layer map to be valid. It's a stronger check imposed by gc-compaction than the actual valid layermap definition. Actually, the system can work as long as there are no overlapping layer maps. Therefore, we degrade that into a warning. Signed-off-by: Alex Chi Z --- storage_scrubber/src/checks.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 525f412b56..8d855d263c 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -128,7 +128,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( let layer_names = index_part.layer_metadata.keys().cloned().collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - result.errors.push(format!( + result.warnings.push(format!( "index_part.json contains invalid layer map structure: {err}" )); } From cc37fa0f33df21cb0adfff922e199b2ef1d30207 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 27 Nov 2024 18:16:41 +0100 Subject: [PATCH 19/29] pageserver: add metrics for unknown `ClearVmBits` pages (#9911) ## Problem When ingesting implicit `ClearVmBits` operations, we silently drop the writes if the relation or page is unknown. There are implicit assumptions around VM pages wrt. explicit/implicit updates, sharding, and relation sizes, which can possibly drop writes incorrectly. Adding a few metrics will allow us to investigate further and tighten up the logic. Touches #9855. ## Summary of changes Add a `pageserver_wal_ingest_clear_vm_bits_unknown` metric to record dropped `ClearVmBits` writes. Also add comments clarifying the behavior of relation sizes on non-zero shards. 
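As a usage sketch (not part of this change): the new counter is exported on the pageserver `/metrics` endpoint and could be inspected from the regression-test fixtures roughly as below. The `get_metric_value` helper and its signature are assumptions here; only the metric name and the `relation` label value come from this patch.

```python
# Hypothetical read of the new counter from a regression test, where `env` is a
# started NeonEnv. `get_metric_value` is an assumed fixture helper; the metric
# name and the "relation" label value are the ones added by this patch.
from fixtures.log_helper import log

dropped = env.pageserver.http_client().get_metric_value(
    "pageserver_wal_ingest_clear_vm_bits_unknown", {"entity": "relation"}
)
log.info(f"ClearVmBits writes dropped for unknown relations: {dropped or 0}")
```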
--- pageserver/src/metrics.rs | 7 +++++ pageserver/src/pgdatadir_mapping.rs | 17 ++++++++-- pageserver/src/walingest.rs | 49 +++++++++++++++++++++-------- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 78a157f51e..86be97587f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2144,6 +2144,7 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, + pub(crate) clear_vm_bits_unknown: IntCounterVec, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -2172,6 +2173,12 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Total number of zero gap blocks written on relation extends" ) .expect("failed to define a metric"), + clear_vm_bits_unknown: register_int_counter_vec!( + "pageserver_wal_ingest_clear_vm_bits_unknown", + "Number of ignored ClearVmBits operations due to unknown pages/relations", + &["entity"], + ) + .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4f42427276..d48a1ba117 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -392,7 +392,9 @@ impl Timeline { result } - // Get size of a database in blocks + /// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on + /// other shards, by only accounting for relations the shard has pages for, and only accounting + /// for pages up to the highest page number it has stored. pub(crate) async fn get_db_size( &self, spcnode: Oid, @@ -411,7 +413,10 @@ impl Timeline { Ok(total_blocks) } - /// Get size of a relation file + /// Get size of a relation file. The relation must exist, otherwise an error is returned. + /// + /// This is only accurate on shard 0. On other shards, it will return the size up to the highest + /// page number stored in the shard. pub(crate) async fn get_rel_size( &self, tag: RelTag, @@ -447,7 +452,10 @@ impl Timeline { Ok(nblocks) } - /// Does relation exist? + /// Does the relation exist? + /// + /// Only shard 0 has a full view of the relations. Other shards only know about relations that + /// the shard stores pages for. pub(crate) async fn get_rel_exists( &self, tag: RelTag, @@ -481,6 +489,9 @@ impl Timeline { /// Get a list of all existing relations in given tablespace and database. /// + /// Only shard 0 has a full view of the relations. Other shards only know about relations that + /// the shard stores pages for. + /// /// # Cancel-Safety /// /// This method is cancellation-safe. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ad6ccbc854..d568da596a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -334,14 +334,32 @@ impl WalIngest { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = get_relsize(modification, vm_rel, ctx).await?; + // + // TODO: analyze the metrics and tighten this up accordingly. This logic + // implicitly assumes that VM pages see explicit WAL writes before + // implicit ClearVmBits, and will otherwise silently drop updates. + let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? 
else { + WAL_INGEST + .clear_vm_bits_unknown + .with_label_values(&["relation"]) + .inc(); + return Ok(()); + }; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { + WAL_INGEST + .clear_vm_bits_unknown + .with_label_values(&["new_page"]) + .inc(); new_vm_blk = None; } } if let Some(blknum) = old_vm_blk { if blknum >= vm_size { + WAL_INGEST + .clear_vm_bits_unknown + .with_label_values(&["old_page"]) + .inc(); old_vm_blk = None; } } @@ -572,7 +590,8 @@ impl WalIngest { modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } - let nblocks = get_relsize(modification, rel, ctx).await?; + // TODO: re-examine the None case here wrt. sharding; should we error? + let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -612,7 +631,8 @@ impl WalIngest { )?; vm_page_no += 1; } - let nblocks = get_relsize(modification, rel, ctx).await?; + // TODO: re-examine the None case here wrt. sharding; should we error? + let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1430,24 +1450,27 @@ impl WalIngest { } } +/// Returns the size of the relation as of this modification, or None if the relation doesn't exist. +/// +/// This is only accurate on shard 0. On other shards, it will return the size up to the highest +/// page number stored in the shard, or None if the shard does not have any pages for it. async fn get_relsize( modification: &DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, -) -> Result { - let nblocks = if !modification +) -> Result, PageReconstructError> { + if !modification .tline .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { - 0 - } else { - modification - .tline - .get_rel_size(rel, Version::Modified(modification), ctx) - .await? - }; - Ok(nblocks) + return Ok(None); + } + modification + .tline + .get_rel_size(rel, Version::Modified(modification), ctx) + .await + .map(Some) } #[allow(clippy::bool_assert_comparison)] From 5c41707beefa99b53548530698dc2970dd876024 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 27 Nov 2024 19:05:46 +0100 Subject: [PATCH 20/29] proxy: promote two logs to error, fix multiline log (#9913) * Promote two logs from mpsc send errors to error level. The channels are unbounded and there shouldn't be errors. * Fix one multiline log from anyhow::Error. Use Debug instead of Display. --- proxy/src/context/mod.rs | 18 +++++++++++------- proxy/src/context/parquet.rs | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 4ec04deb25..5c19a23e36 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams; use smol_str::SmolStr; use tokio::sync::mpsc; use tracing::field::display; -use tracing::{debug, info_span, Span}; +use tracing::{debug, error, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; @@ -415,9 +415,11 @@ impl RequestContextInner { }); } if let Some(tx) = self.sender.take() { - tx.send(RequestData::from(&*self)) - .inspect_err(|e| debug!("tx send failed: {e}")) - .ok(); + // If type changes, this error handling needs to be updated. 
+ let tx: mpsc::UnboundedSender = tx; + if let Err(e) = tx.send(RequestData::from(&*self)) { + error!("log_connect channel send failed: {e}"); + } } } @@ -426,9 +428,11 @@ impl RequestContextInner { // Here we log the length of the session. self.disconnect_timestamp = Some(Utc::now()); if let Some(tx) = self.disconnect_sender.take() { - tx.send(RequestData::from(&*self)) - .inspect_err(|e| debug!("tx send failed: {e}")) - .ok(); + // If type changes, this error handling needs to be updated. + let tx: mpsc::UnboundedSender = tx; + if let Err(e) = tx.send(RequestData::from(&*self)) { + error!("log_disconnect channel send failed: {e}"); + } } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9bf3a275bb..e328c6de79 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -398,7 +398,7 @@ async fn upload_parquet( .err(); if let Some(err) = maybe_err { - tracing::warn!(%id, %err, "failed to upload request data"); + tracing::error!(%id, error = ?err, "failed to upload request data"); } Ok(buffer.writer()) From 9e3cb75bc785a87967f5a8c0f866f65808680b2e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:30:54 -0500 Subject: [PATCH 21/29] fix(pageserver): flush deletion queue in `reload` shutdown mode (#9884) ## Problem close https://github.com/neondatabase/neon/issues/9859 ## Summary of changes Ensure that the deletion queue gets fully flushed (i.e., the deletion lists get applied) during a graceful shutdown. It is still possible that an incomplete shutdown would leave deletion list behind and cause race upon the next startup, but we assume this will unlikely happen, and even if it happened, the pageserver should already be at a tainted state and the tenant should be moved to a new tenant with a new generation number. --------- Signed-off-by: Alex Chi Z --- pageserver/src/deletion_queue.rs | 74 +++++++++++------------ pageserver/src/tenant.rs | 12 ++++ pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/timeline.rs | 11 ++-- pageserver/src/tenant/timeline/offload.rs | 2 +- 5 files changed, 57 insertions(+), 44 deletions(-) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 37fa300467..e74c8ecf5a 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -1144,18 +1144,24 @@ pub(crate) mod mock { rx: tokio::sync::mpsc::UnboundedReceiver, executor_rx: tokio::sync::mpsc::Receiver, cancel: CancellationToken, + executed: Arc, } impl ConsumerState { - async fn consume(&mut self, remote_storage: &GenericRemoteStorage) -> usize { - let mut executed = 0; - + async fn consume(&mut self, remote_storage: &GenericRemoteStorage) { info!("Executing all pending deletions"); // Transform all executor messages to generic frontend messages - while let Ok(msg) = self.executor_rx.try_recv() { + loop { + use either::Either; + let msg = tokio::select! { + left = self.executor_rx.recv() => Either::Left(left), + right = self.rx.recv() => Either::Right(right), + }; match msg { - DeleterMessage::Delete(objects) => { + Either::Left(None) => break, + Either::Right(None) => break, + Either::Left(Some(DeleterMessage::Delete(objects))) => { for path in objects { match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { @@ -1165,18 +1171,13 @@ pub(crate) mod mock { error!("Failed to delete {path}, leaking object! 
({e})"); } } - executed += 1; + self.executed.fetch_add(1, Ordering::Relaxed); } } - DeleterMessage::Flush(flush_op) => { + Either::Left(Some(DeleterMessage::Flush(flush_op))) => { flush_op.notify(); } - } - } - - while let Ok(msg) = self.rx.try_recv() { - match msg { - ListWriterQueueMessage::Delete(op) => { + Either::Right(Some(ListWriterQueueMessage::Delete(op))) => { let mut objects = op.objects; for (layer, meta) in op.layers { objects.push(remote_layer_path( @@ -1198,33 +1199,27 @@ pub(crate) mod mock { error!("Failed to delete {path}, leaking object! ({e})"); } } - executed += 1; + self.executed.fetch_add(1, Ordering::Relaxed); } } - ListWriterQueueMessage::Flush(op) => { + Either::Right(Some(ListWriterQueueMessage::Flush(op))) => { op.notify(); } - ListWriterQueueMessage::FlushExecute(op) => { + Either::Right(Some(ListWriterQueueMessage::FlushExecute(op))) => { // We have already executed all prior deletions because mock does them inline op.notify(); } - ListWriterQueueMessage::Recover(_) => { + Either::Right(Some(ListWriterQueueMessage::Recover(_))) => { // no-op in mock } } - info!("All pending deletions have been executed"); } - - executed } } pub struct MockDeletionQueue { tx: tokio::sync::mpsc::UnboundedSender, executor_tx: tokio::sync::mpsc::Sender, - executed: Arc, - remote_storage: Option, - consumer: std::sync::Mutex, lsn_table: Arc>, } @@ -1235,29 +1230,34 @@ pub(crate) mod mock { let executed = Arc::new(AtomicUsize::new(0)); + let mut consumer = ConsumerState { + rx, + executor_rx, + cancel: CancellationToken::new(), + executed: executed.clone(), + }; + + tokio::spawn(async move { + if let Some(remote_storage) = &remote_storage { + consumer.consume(remote_storage).await; + } + }); + Self { tx, executor_tx, - executed, - remote_storage, - consumer: std::sync::Mutex::new(ConsumerState { - rx, - executor_rx, - cancel: CancellationToken::new(), - }), lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), } } #[allow(clippy::await_holding_lock)] pub async fn pump(&self) { - if let Some(remote_storage) = &self.remote_storage { - // Permit holding mutex across await, because this is only ever - // called once at a time in tests. - let mut locked = self.consumer.lock().unwrap(); - let count = locked.consume(remote_storage).await; - self.executed.fetch_add(count, Ordering::Relaxed); - } + let (tx, rx) = tokio::sync::oneshot::channel(); + self.executor_tx + .send(DeleterMessage::Flush(FlushOp { tx })) + .await + .expect("Failed to send flush message"); + rx.await.ok(); } pub(crate) fn new_client(&self) -> DeletionQueueClient { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bddcb534a1..339a3ca1bb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3215,6 +3215,18 @@ impl Tenant { } } + if let ShutdownMode::Reload = shutdown_mode { + tracing::info!("Flushing deletion queue"); + if let Err(e) = self.deletion_queue_client.flush().await { + match e { + DeletionQueueError::ShuttingDown => { + // This is the only error we expect for now. In the future, if more error + // variants are added, we should handle them here. + } + } + } + } + // We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits // them to continue to do work during their shutdown methods, e.g. flushing data. 
tracing::debug!("Cancelling CancellationToken"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 92b2200542..eb8191e43e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1960,7 +1960,7 @@ impl TenantManager { attempt.before_reset_tenant(); let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Flush).await { + match tenant.shutdown(progress, ShutdownMode::Reload).await { Ok(()) => { slot_guard.drop_old_value().expect("it was just shutdown"); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index afd4664d01..730477a7f4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -894,10 +894,11 @@ pub(crate) enum ShutdownMode { /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. FreezeAndFlush, - /// Only flush the layers to the remote storage without freezing any open layers. This is the - /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing - /// the generation number. - Flush, + /// Only flush the layers to the remote storage without freezing any open layers. Flush the deletion + /// queue. This is the mode used by ancestor detach and any other operations that reloads a tenant + /// but not increasing the generation number. Note that this mode cannot be used at tenant shutdown, + /// as flushing the deletion queue at that time will cause shutdown-in-progress errors. + Reload, /// Shut down immediately, without waiting for any open layers to flush. Hard, } @@ -1818,7 +1819,7 @@ impl Timeline { } } - if let ShutdownMode::Flush = mode { + if let ShutdownMode::Reload = mode { // drain the upload queue self.remote_client.shutdown().await; if !self.remote_client.no_pending_work() { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 3595d743bc..3bfbfb5061 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline( } // Now that the Timeline is in Stopping state, request all the related tasks to shut down. - timeline.shutdown(super::ShutdownMode::Flush).await; + timeline.shutdown(super::ShutdownMode::Reload).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress From da1daa2426aa592f5d57a13c4b09b5d21bcbeaf7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 27 Nov 2024 20:44:24 +0100 Subject: [PATCH 22/29] pageserver: only apply `ClearVmBits` on relevant shards (#9895) # Problem VM (visibility map) pages are stored and managed as any regular relation page, in the VM fork of the main relation. They are also sharded like other pages. Regular WAL writes to the VM pages (typically performed by vacuum) are routed to the correct shard as usual. However, VM pages are also updated via `ClearVmBits` metadata records emitted when main relation pages are updated. These metadata records were sent to all shards, like other metadata records. This had the following effects: * On shards responsible for VM pages, the `ClearVmBits` applies as expected. * On shard 0, which knows about the VM relation and its size but doesn't necessarily have any VM pages, the `ClearVmBits` writes may have been applied without also having applied the explicit WAL writes to VM pages. 
* If VM pages are spread across multiple shards (unlikely with 256MB stripe size), all shards may have applied `ClearVmBits` if the pages fall within their local view of the relation size, even for pages they do not own. * On other shards, this caused a relation size cache miss and a DbDir and RelDir lookup before dropping the `ClearVmBits`. With many relations, this could cause significant CPU overhead. This is not believed to be a correctness problem, but this will be verified in #9914. Resolves #9855. # Changes Route `ClearVmBits` metadata records only to the shards responsible for the VM pages. Verification of the current VM handling and cleanup of incomplete VM pages on shard 0 (and potentially elsewhere) is left as follow-up work. --- libs/wal_decoder/src/decoder.rs | 71 +++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 1895f25bfc..36c4b19266 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -4,6 +4,7 @@ use crate::models::*; use crate::serialized_batch::SerializedValueBatch; use bytes::{Buf, Bytes}; +use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::pg_constants; @@ -32,7 +33,8 @@ impl InterpretedWalRecord { FlushUncommittedRecords::No }; - let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?; + let metadata_record = + MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?; let batch = SerializedValueBatch::from_decoded_filtered( decoded, shard, @@ -51,8 +53,13 @@ impl InterpretedWalRecord { } impl MetadataRecord { - fn from_decoded( + /// Builds a metadata record for this WAL record, if any. + /// + /// Only metadata records relevant for the given shard are emitted. Currently, most metadata + /// records are broadcast to all shards for simplicity, but this should be improved. + fn from_decoded_filtered( decoded: &DecodedWALRecord, + shard: &ShardIdentity, next_record_lsn: Lsn, pg_version: u32, ) -> anyhow::Result> { @@ -61,26 +68,27 @@ impl MetadataRecord { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); - match decoded.xl_rmid { + // First, generate metadata records from the decoded WAL record. + let mut metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { - Self::decode_heapam_record(&mut buf, decoded, pg_version) + Self::decode_heapam_record(&mut buf, decoded, pg_version)? 
} - pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version), + pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?, // Handle other special record types - pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded), - pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version), + pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?, + pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?, pg_constants::RM_TBLSPC_ID => { tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); - Ok(None) + None } - pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version), + pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?, pg_constants::RM_XACT_ID => { - Self::decode_xact_record(&mut buf, decoded, next_record_lsn) + Self::decode_xact_record(&mut buf, decoded, next_record_lsn)? } pg_constants::RM_MULTIXACT_ID => { - Self::decode_multixact_record(&mut buf, decoded, pg_version) + Self::decode_multixact_record(&mut buf, decoded, pg_version)? } - pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded), + pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?, // This is an odd duck. It needs to go to all shards. // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY // in WalIngest::new), we have to send the whole DecodedWalRecord::record to @@ -89,19 +97,48 @@ impl MetadataRecord { // Alternatively, one can make the checkpoint part of the subscription protocol // to the pageserver. This should work fine, but can be done at a later point. pg_constants::RM_XLOG_ID => { - Self::decode_xlog_record(&mut buf, decoded, next_record_lsn) + Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)? } pg_constants::RM_LOGICALMSG_ID => { - Self::decode_logical_message_record(&mut buf, decoded) + Self::decode_logical_message_record(&mut buf, decoded)? } - pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded), - pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded), + pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?, + pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?, _unexpected => { // TODO: consider failing here instead of blindly doing something without // understanding the protocol - Ok(None) + None + } + }; + + // Next, filter the metadata record by shard. + + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + if let Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), + ) = metadata_record + { + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + clear_vm_bits.old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + clear_vm_bits.new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. 
+ if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() { + metadata_record = None } } + + Ok(metadata_record) } fn decode_heapam_record( From 8173dc600ad68872f4e488c753f59b8a1e2093aa Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 28 Nov 2024 08:32:22 +0200 Subject: [PATCH 23/29] proxy: spawn cancellation checks in the background (#9918) ## Problem For cancellation, a connection is open during all the cancel checks. ## Summary of changes Spawn cancellation checks in the background, and close connection immediately. Use task_tracker for cancellation checks. --- proxy/src/cancellation.rs | 15 ++++++++----- proxy/src/console_redirect_proxy.rs | 35 +++++++++++++++++++++-------- proxy/src/proxy/mod.rs | 35 +++++++++++++++++++++-------- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/mod.rs | 9 ++++++++ proxy/src/serverless/websocket.rs | 3 +++ 6 files changed, 75 insertions(+), 24 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 74415f1ffe..91e198bf88 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -99,16 +99,17 @@ impl CancellationHandler
{ /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query + /// return Result primarily for tests pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, - peer_addr: &IpAddr, + peer_addr: IpAddr, check_allowed: bool, ) -> Result<(), CancelError> { // TODO: check for unspecified address is only for backward compatibility, should be removed if !peer_addr.is_unspecified() { - let subnet_key = match *peer_addr { + let subnet_key = match peer_addr { IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; @@ -141,9 +142,11 @@ impl CancellationHandler
{ return Ok(()); } - match self.client.try_publish(key, session_id, *peer_addr).await { + match self.client.try_publish(key, session_id, peer_addr).await { Ok(()) => {} // do nothing Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); return Err(CancelError::IO(std::io::Error::new( std::io::ErrorKind::Other, e.to_string(), @@ -154,8 +157,10 @@ impl CancellationHandler
{ }; if check_allowed - && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); return Err(CancelError::IpNotAllowed); } @@ -306,7 +311,7 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), - &("127.0.0.1".parse().unwrap()), + "127.0.0.1".parse().unwrap(), true, ) .await diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index b910b524b1..8f78df1964 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -35,6 +35,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -48,6 +49,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -96,6 +98,7 @@ pub async fn task_main( cancellation_handler, socket, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -127,10 +130,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -142,6 +147,7 @@ pub(crate) async fn handle_client( cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -161,15 +167,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) 
+ // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7fe67e43de..956036d29d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -69,6 +69,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -82,6 +83,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); @@ -133,6 +135,7 @@ pub async fn task_main( ClientMode::Tcp, endpoint_rate_limiter2, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -164,10 +167,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -250,6 +255,7 @@ pub(crate) async fn handle_client( mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -270,15 +276,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) 
+ // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 65008ae943..9ac07b7e90 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -149,7 +149,7 @@ impl MessageHandler { .cancel_session( cancel_session.cancel_key_data, uuid::Uuid::nil(), - &peer_addr, + peer_addr, cancel_session.peer_addr.is_some(), ) .await diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 77025f419d..80b42f9e55 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -132,6 +132,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; if let Err(e) = conn.set_nodelay(true) { @@ -160,6 +161,7 @@ pub async fn task_main( let connections2 = connections.clone(); let cancellation_handler = cancellation_handler.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellations = cancellations.clone(); connections.spawn( async move { let conn_token2 = conn_token.clone(); @@ -188,6 +190,7 @@ pub async fn task_main( config, backend, connections2, + cancellations, cancellation_handler, endpoint_rate_limiter, conn_token, @@ -313,6 +316,7 @@ async fn connection_handler( config: &'static ProxyConfig, backend: Arc, connections: TaskTracker, + cancellations: TaskTracker, cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, @@ -353,6 +357,7 @@ async fn connection_handler( // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. // By spawning the future, we ensure it never gets cancelled until it decides to. + let cancellations = cancellations.clone(); let handler = connections.spawn( request_handler( req, @@ -364,6 +369,7 @@ async fn connection_handler( conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), + cancellations, ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -411,6 +417,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellations: TaskTracker, ) -> Result>, ApiError> { let host = request .headers() @@ -436,6 +443,7 @@ async fn request_handler( let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; + let cancellations = cancellations.clone(); ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( @@ -446,6 +454,7 @@ async fn request_handler( cancellation_handler, endpoint_rate_limiter, host, + cancellations, ) .await { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4088fea835..bdb83fe6be 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -123,6 +123,7 @@ impl AsyncBufRead for WebSocketRw { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -131,6 +132,7 @@ pub(crate) async fn serve_websocket( cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> anyhow::Result<()> { let websocket = websocket.await?; let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); @@ -149,6 +151,7 @@ pub(crate) async fn serve_websocket( ClientMode::Websockets { hostname }, endpoint_rate_limiter, conn_gauge, + cancellations, )) .await; From e82f7f0dfc1571ddbbb4ff37c1c94579a7101834 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 10:11:08 +0000 Subject: [PATCH 24/29] remote_storage/abs: count 404 and 304 for get as ok for metrics (#9912) ## Problem We currently see elevated levels of errors for GetBlob requests. This is because 404 and 304 are counted as errors for metric reporting. ## Summary of Changes Bring the implementation in line with the S3 client and treat 404 and 304 responses as ok for metric purposes. Related: https://github.com/neondatabase/cloud/issues/20666 --- libs/remote_storage/src/azure_blob.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ae0a94295c..840917ef68 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -220,6 +220,11 @@ impl AzureBlobStorage { let started_at = ScopeGuard::into_inner(started_at); let outcome = match &download { Ok(_) => AttemptOutcome::Ok, + // At this level in the stack 404 and 304 responses do not indicate an error. + // There's expected cases when a blob may not exist or hasn't been modified since + // the last get (e.g. probing for timeline indices and heatmap downloads). + // Callers should handle errors if they are unexpected. + Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, }; crate::metrics::BUCKET_METRICS From 70780e310c9640650eeb8b5cb0838bebd1c6c0ff Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 28 Nov 2024 16:48:18 +0100 Subject: [PATCH 25/29] Makefile: build pg_visibility (#9922) Build the `pg_visibility` extension for use with `neon_local`. This is useful to inspect the visibility map for debugging. Touches #9914. 
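Once the contrib extension is built, it can be exercised from `psql` against a locally running endpoint. A minimal debugging session might look like the following sketch (`my_table` is a placeholder relation):

```sql
CREATE EXTENSION IF NOT EXISTS pg_visibility;

-- Per-page all-visible / all-frozen bits from the VM fork of the relation.
SELECT blkno, all_visible, all_frozen
FROM pg_visibility_map('my_table')
LIMIT 10;

-- Aggregate counts for the whole relation.
SELECT * FROM pg_visibility_map_summary('my_table');

-- Cross-check the VM against the heap: returns TIDs of tuples that contradict
-- the all-visible / all-frozen bits.
SELECT * FROM pg_check_visible('my_table');
SELECT * FROM pg_check_frozen('my_table');
```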
--- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index dc67b87239..9cffc74508 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pg_visibility $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" From eb5d832e6fea0e1c3c14b9e6024fce916c3f1c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 28 Nov 2024 16:49:30 +0100 Subject: [PATCH 26/29] Update rust to 1.83.0, also update cargo adjacent tools (#9926) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.83.0/). Also update `cargo-hakari`, `cargo-deny`, `cargo-hack` and `cargo-nextest` to their latest versions. Prior update was in #9445. --- build-tools.Dockerfile | 18 +++++++++--------- rust-toolchain.toml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 4f491afec5..2671702697 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,9 +57,9 @@ RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ chmod -R 755 /pgcopydb && \ chown -R nonroot:nonroot /pgcopydb - -COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb -COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 # System deps # @@ -258,14 +258,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.82.0 +ENV RUSTC_VERSION=1.83.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 -ARG CARGO_HAKARI_VERSION=0.9.30 -ARG CARGO_DENY_VERSION=0.16.1 -ARG CARGO_HACK_VERSION=0.6.31 -ARG CARGO_NEXTEST_VERSION=0.9.72 +ARG CARGO_HAKARI_VERSION=0.9.33 +ARG CARGO_DENY_VERSION=0.16.2 +ARG CARGO_HACK_VERSION=0.6.33 +ARG CARGO_NEXTEST_VERSION=0.9.85 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -289,7 +289,7 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 92b7929c7f..f0661a32e0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.82.0" +channel = "1.83.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html From eb520a14ce12dc16f33f39964632982f6c14b9f3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 17:38:47 +0000 Subject: [PATCH 27/29] pageserver: return correct LSN for interpreted proto keep alive responses (#9928) ## Problem For the interpreted proto the pageserver is not returning the correct LSN in replies to keep alive requests. This is because the interpreted protocol arm was not updating `last_rec_lsn`. ## Summary of changes * Return correct LSN in keep-alive responses * Fix shard field in wal sender traces --- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 4 ++++ safekeeper/src/handler.rs | 5 +++-- safekeeper/src/wal_service.rs | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 31cf1b6307..d90ffbfa2c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -454,6 +454,10 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); + if let Some(lsn) = next_record_lsn { + last_rec_lsn = lsn; + } + Some(streaming_lsn) } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 22f33b17e0..8dd2929a03 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -212,8 +212,9 @@ impl postgres_backend::Handler ); if let Some(shard) = self.shard.as_ref() { - tracing::Span::current() - .record("shard", tracing::field::display(shard.shard_slug())); + if let Some(slug) = shard.shard_slug().strip_prefix("-") { + tracing::Span::current().record("shard", tracing::field::display(slug)); + } } Ok(()) diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ab54d4cce..5248d545db 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -44,7 +44,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty, shard = field::Empty)), ); } } From e04dd3be0b6bd6702fa6e3301c9b7202d72ccc1c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Nov 2024 19:02:57 +0000 Subject: [PATCH 28/29] test_runner: rerun all failed tests (#9917) ## Problem Currently, we rerun only known flaky tests. This approach was chosen to reduce the number of tests that go unnoticed (by forcing people to take a look at failed tests and rerun the job manually), but it has some drawbacks: - In PRs, people tend to push new changes without checking failed tests (that's ok) - In the main, tests are just restarted without checking (understandable) - Parametrised tests become flaky one by one, i.e. if `test[1]` is flaky `, test[2]` is not marked as flaky automatically (which may or may not be the case). I suggest rerunning all failed tests to increase the stability of GitHub jobs and using the Grafana Dashboard with flaky tests for deeper analysis. 
## Summary of changes - Rerun all failed tests twice at max --- .../actions/run-python-test-set/action.yml | 17 +- .github/workflows/_build-and-test-locally.yml | 2 +- poetry.lock | 12 +- pyproject.toml | 2 +- scripts/flaky_tests.py | 147 ------------------ test_runner/conftest.py | 2 +- test_runner/fixtures/flaky.py | 78 ---------- test_runner/fixtures/paths.py | 2 +- test_runner/fixtures/reruns.py | 31 ++++ 9 files changed, 46 insertions(+), 247 deletions(-) delete mode 100755 scripts/flaky_tests.py delete mode 100644 test_runner/fixtures/flaky.py create mode 100644 test_runner/fixtures/reruns.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 275f161019..1159627302 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,8 +36,8 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - rerun_flaky: - description: 'Whether to rerun flaky tests' + rerun_failed: + description: 'Whether to rerun failed tests' required: false default: 'false' pg_version: @@ -108,7 +108,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') - RERUN_FLAKY: ${{ inputs.rerun_flaky }} + RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | @@ -154,15 +154,8 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi - if [ "${RERUN_FLAKY}" == "true" ]; then - mkdir -p $TEST_OUTPUT - poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \ - --days 7 \ - --output "$TEST_OUTPUT/flaky.json" \ - --pg-version "${DEFAULT_PG_VERSION}" \ - --build-type "${BUILD_TYPE}" - - EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + if [ "${RERUN_FAILED}" == "true" ]; then + EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS" fi # We use pytest-split plugin to run benchmarks in parallel on different CI runners diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index bdf7c07c6a..42c32a23e3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -293,7 +293,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_flaky: true + rerun_failed: true pg_version: ${{ matrix.pg_version }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/poetry.lock b/poetry.lock index e2fca7be47..59ae5cf1ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2563,18 +2563,18 @@ pytest = "*" [[package]] name = "pytest-rerunfailures" -version = "13.0" +version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, - {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, + {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, + {file = 
"pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=7" +pytest = ">=7.4,<8.2.2 || >8.2.2" [[package]] name = "pytest-split" @@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" +content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" diff --git a/pyproject.toml b/pyproject.toml index ccd3ab1864..01d15ee6bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.10.11" -pytest-rerunfailures = "^13.0" +pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py deleted file mode 100755 index 3fb668ed2d..0000000000 --- a/scripts/flaky_tests.py +++ /dev/null @@ -1,147 +0,0 @@ -#! /usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -from collections import defaultdict -from typing import TYPE_CHECKING - -import psycopg2 -import psycopg2.extras -import toml - -if TYPE_CHECKING: - from typing import Any - -FLAKY_TESTS_QUERY = """ - SELECT - DISTINCT parent_suite, suite, name - FROM results - WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day - AND ( - (status IN ('failed', 'broken') AND reference = 'refs/heads/main') - OR flaky - ) - ; -""" - - -def main(args: argparse.Namespace): - connstr = args.connstr - interval_days = args.days - output = args.output - - build_type = args.build_type - pg_version = args.pg_version - - res: defaultdict[str, defaultdict[str, dict[str, bool]]] - res = defaultdict(lambda: defaultdict(dict)) - - try: - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=30) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() - except psycopg2.OperationalError as exc: - logging.error("cannot fetch flaky tests from the DB due to an error", exc) - rows = [] - - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. 
not empty, not tokio-epoll-uring), - # use it to parametrize test name along with build_type and pg_version - # - # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( - "", - "tokio-epoll-uring", - ): - pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" - else: - pageserver_virtual_file_io_engine_parameter = "" - - # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: - """Duplicated from parametrize.py""" - toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") - if toml_table is None: - return None - v = toml.loads(toml_table) - assert isinstance(v, dict) - return v - - pageserver_default_tenant_config_compaction_algorithm_parameter = "" - if ( - explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() - ) is not None: - pageserver_default_tenant_config_compaction_algorithm_parameter = ( - f"-{explicit_default['kind']}" - ) - - for row in rows: - # We don't want to automatically rerun tests in a performance suite - if row["parent_suite"] != "test_runner.regress": - continue - - if row["name"].endswith("]"): - parametrized_test = row["name"].replace( - "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", - ) - else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" - - res[row["parent_suite"]][row["suite"]][parametrized_test] = True - - logging.info( - f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}" - ) - - logging.info(f"saving results to {output.name}") - json.dump(res, output, indent=2) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days") - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="flaky.json", - help="path to output json file (default: flaky.json)", - ) - parser.add_argument( - "--days", - required=False, - default=10, - type=int, - help="how many days to look back for flaky tests (default: 10)", - ) - parser.add_argument( - "--build-type", - required=True, - type=str, - help="for which build type to create list of flaky tests (debug or release)", - ) - parser.add_argument( - "--pg-version", - required=True, - type=int, - help="for which Postgres version to create list of flaky tests (14, 15, etc.)", - ) - parser.add_argument( - "connstr", - help="connection string to the test results database", - ) - args = parser.parse_args() - - level = logging.INFO - logging.basicConfig( - format="%(message)s", - level=level, - ) - - main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 84eda52d33..887bfef478 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -13,5 +13,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", - "fixtures.flaky", + "fixtures.reruns", ) diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py deleted file mode 100644 index 01634a29c5..0000000000 --- a/test_runner/fixtures/flaky.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -import json -from collections.abc import MutableMapping -from pathlib import Path -from typing import 
TYPE_CHECKING, cast - -import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from allure_commons.types import LabelType -from allure_pytest.utils import allure_name, allure_suite_labels - -from fixtures.log_helper import log - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from typing import Any - - -""" -The plugin reruns flaky tests. -It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` - -Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers -""" - - -def pytest_addoption(parser: Parser): - parser.addoption( - "--flaky-tests-json", - action="store", - type=Path, - help="Path to json file with flaky tests generated by scripts/flaky_tests.py", - ) - - -def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): - if not config.getoption("--flaky-tests-json"): - return - - # Any error with getting flaky tests aren't critical, so just do not rerun any tests - flaky_json = config.getoption("--flaky-tests-json") - if not flaky_json.exists(): - return - - content = flaky_json.read_text() - try: - flaky_tests = json.loads(content) - except ValueError: - log.error(f"Can't parse {content} as json") - return - - for item in items: - # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) - # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 - allure_labels = dict(allure_suite_labels(item)) - parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) - suite = str(allure_labels.get(LabelType.SUITE)) - params = item.callspec.params if hasattr(item, "callspec") else {} - name = allure_name(item, params) - - if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): - # Rerun 3 times = 1 original run + 2 reruns - log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times") - item.add_marker(pytest.mark.flaky(reruns=2)) - - # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), - # we can workaround it by setting `timeout_func_only` to True[1]. - # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], - # but we still can do it using pytest marker. - # - # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 - # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 - timeout_marker = item.get_closest_marker("timeout") - if timeout_marker is not None: - kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) - kwargs["func_only"] = True diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 1c71abea19..80777d65e9 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" - # We rerun flaky tests multiple times, use a separate directory for each run. + # We rerun failed tests multiple times, use a separate directory for each run. 
if (suffix := getattr(request.node, "execution_count", None)) is not None: test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py new file mode 100644 index 0000000000..f2a25ae8f6 --- /dev/null +++ b/test_runner/fixtures/reruns.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from collections.abc import MutableMapping +from typing import TYPE_CHECKING, cast + +import pytest + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import Any + + from _pytest.config import Config + + +def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): + # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), + # we can workaround it by setting `timeout_func_only` to True[1]. + # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], + # but we still can do it using pytest marker. + # + # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 + # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 + + if not config.getoption("--reruns"): + return + + for item in items: + timeout_marker = item.get_closest_marker("timeout") + if timeout_marker is not None: + kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) + kwargs["func_only"] = True From 42fb3c4d30bf93ad0ad85bbd636a4262d205f673 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 28 Nov 2024 22:38:30 +0100 Subject: [PATCH 29/29] fix(compute_ctl): Allow usage of DB names with whitespaces (#9919) ## Problem We used `set_path()` to replace the database name in the connection string. It automatically does url-safe encoding if the path is not already encoded, but it does it as per the URL standard, which assumes that tabs can be safely removed from the path without changing the meaning of the URL. See, e.g., https://url.spec.whatwg.org/#concept-basic-url-parser. It also breaks for DBs with properly %-encoded names, like with `%20`, as they are kept intact, but actually should be escaped. Yet, this is not true for Postgres, where it's completely valid to have trailing tabs in the database name. I think this is the PR that caused this regression https://github.com/neondatabase/neon/pull/9717, as it switched from `postgres::config::Config` back to `set_path()`. This was fixed a while ago already [1], btw, I just haven't added a test to catch this regression back then :( ## Summary of changes This commit changes the code back to use `postgres/tokio_postgres::Config` everywhere. While on it, also do some changes around, as I had to touch this code: 1. Bump some logging from `debug` to `info` in the spec apply path. We do not use `debug` in prod, and it was tricky to understand what was going on with this bug in prod. 2. Refactor configuration concurrency calculation code so it was reusable. Yet, still keep `1` in the case of reconfiguration. The database can be actively used at this moment, so we cannot guarantee that there will be enough spare connection slots, and the underlying code won't handle connection errors properly. 3. Simplify the installed extensions code. It was spawning a blocking task inside async function, which doesn't make much sense. Instead, just have a main sync function and call it with `spawn_blocking` in the API code -- the only place we need it to be async. 4. Add regression python test to cover this and related problems in the future. 
Also, add more extensive testing of schema dump and DBs and roles listing API. [1]: https://github.com/neondatabase/neon/commit/4d1e48f3b9a4b7064787513fd2c455f0001f6e18 [2]: https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org Resolves neondatabase/cloud#20869 --- compute_tools/src/catalog.rs | 39 ++++- compute_tools/src/compute.rs | 153 +++++++++++--------- compute_tools/src/http/api.rs | 7 +- compute_tools/src/installed_extensions.rs | 105 +++++--------- compute_tools/src/pg_helpers.rs | 11 ++ test_runner/fixtures/endpoint/http.py | 6 +- test_runner/fixtures/neon_fixtures.py | 29 ++++ test_runner/regress/test_compute_catalog.py | 111 +++++++++++++- 8 files changed, 318 insertions(+), 143 deletions(-) diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 2f6f82dd39..08ae8bf44d 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,4 +1,3 @@ -use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; use std::{path::Path, process::Stdio, result::Result, sync::Arc}; @@ -13,7 +12,8 @@ use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; -use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async}; +use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; +use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let connstr = compute.connstr.clone(); @@ -43,6 +43,8 @@ pub enum SchemaDumpError { DatabaseDoesNotExist, #[error("Failed to execute pg_dump.")] IO(#[from] std::io::Error), + #[error("Unexpected error.")] + Unexpected, } // It uses the pg_dump utility to dump the schema of the specified database. @@ -60,11 +62,38 @@ pub async fn get_database_schema( let pgbin = &compute.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); - let mut connstr = compute.connstr.clone(); - connstr.set_path(dbname); + + // Replace the DB in the connection string and disable it to parts. + // This is the only option to handle DBs with special characters. 
+    let conf =
+        postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
+    let host = conf
+        .get_hosts()
+        .first()
+        .ok_or(SchemaDumpError::Unexpected)?;
+    let host = match host {
+        tokio_postgres::config::Host::Tcp(ip) => ip.to_string(),
+        #[cfg(unix)]
+        tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(),
+    };
+    let port = conf
+        .get_ports()
+        .first()
+        .ok_or(SchemaDumpError::Unexpected)?;
+    let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?;
+    let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?;
+
     let mut cmd = Command::new(pgdump)
+        // XXX: this seems to be the only option to deal with DBs with `=` in the name,
+        // see <https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org>
+        .env("PGDATABASE", dbname)
+        .arg("--host")
+        .arg(host)
+        .arg("--port")
+        .arg(port.to_string())
+        .arg("--username")
+        .arg(user)
         .arg("--schema-only")
-        .arg(connstr.as_str())
         .stdout(Stdio::piped())
         .stderr(Stdio::piped())
         .kill_on_drop(true)
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 4f67425ba8..1a026a4014 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -34,9 +34,8 @@ use utils::measured_stream::MeasuredReader;
 use nix::sys::signal::{kill, Signal};
 use remote_storage::{DownloadError, RemotePath};
 use tokio::spawn;
-use url::Url;
 
-use crate::installed_extensions::get_installed_extensions_sync;
+use crate::installed_extensions::get_installed_extensions;
 use crate::local_proxy;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -816,30 +815,32 @@ impl ComputeNode {
         Ok(())
     }
 
-    async fn get_maintenance_client(url: &Url) -> Result<Client> {
-        let mut connstr = url.clone();
+    async fn get_maintenance_client(
+        conf: &tokio_postgres::Config,
+    ) -> Result<Client> {
+        let mut conf = conf.clone();
 
-        connstr
-            .query_pairs_mut()
-            .append_pair("application_name", "apply_config");
+        conf.application_name("apply_config");
 
-        let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await {
+        let (client, conn) = match conf.connect(NoTls).await {
+            // If the connection fails, it may be an old node with the `zenith_admin` superuser.
+            //
+            // In this case we need to connect with the old `zenith_admin` name
+            // and create a new user. We cannot simply rename the connected user,
+            // but we can create a new one and grant it all privileges.
            Err(e) => match e.code() {
                Some(&SqlState::INVALID_PASSWORD)
                | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
-                    // connect with zenith_admin if cloud_admin could not authenticate
+                    // Connect with zenith_admin if cloud_admin could not authenticate
                    info!(
                        "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                        e
                    );
-                    let mut zenith_admin_connstr = connstr.clone();
-
-                    zenith_admin_connstr
-                        .set_username("zenith_admin")
-                        .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+                    let mut zenith_admin_conf = postgres::config::Config::from(conf.clone());
+                    zenith_admin_conf.user("zenith_admin");
                    let mut client =
-                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
+                        zenith_admin_conf.connect(NoTls)
                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
 
                    // Disable forwarding so that users don't get a cloud_admin role
@@ -853,8 +854,8 @@ impl ComputeNode {
 
                    drop(client);
 
-                    // reconnect with connstring with expected name
-                    tokio_postgres::connect(connstr.as_str(), NoTls).await?
+                    // Reconnect with the connection string with the expected name
+                    conf.connect(NoTls).await?
                }
                _ => return Err(e.into()),
            },
@@ -885,7 +886,7 @@ impl ComputeNode {
     pub fn apply_spec_sql(
         &self,
         spec: Arc<ComputeSpec>,
-        url: Arc<Url>,
+        conf: Arc<tokio_postgres::Config>,
         concurrency: usize,
     ) -> Result<()> {
         let rt = tokio::runtime::Builder::new_multi_thread()
@@ -897,7 +898,7 @@ impl ComputeNode {
 
         rt.block_on(async {
             // Proceed with post-startup configuration. Note, that order of operations is important.
-            let client = Self::get_maintenance_client(&url).await?;
+            let client = Self::get_maintenance_client(&conf).await?;
             let spec = spec.clone();
 
             let databases = get_existing_dbs_async(&client).await?;
@@ -931,7 +932,7 @@ impl ComputeNode {
                 RenameAndDeleteDatabases,
                 CreateAndAlterDatabases,
             ] {
-                debug!("Applying phase {:?}", &phase);
+                info!("Applying phase {:?}", &phase);
                 apply_operations(
                     spec.clone(),
                     ctx.clone(),
@@ -942,6 +943,7 @@ impl ComputeNode {
                 .await?;
             }
 
+            info!("Applying RunInEachDatabase phase");
             let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
 
             let db_processes = spec
@@ -955,7 +957,7 @@ impl ComputeNode {
                     let spec = spec.clone();
                     let ctx = ctx.clone();
                     let jwks_roles = jwks_roles.clone();
-                    let mut url = url.as_ref().clone();
+                    let mut conf = conf.as_ref().clone();
                     let concurrency_token = concurrency_token.clone();
                     let db = db.clone();
 
@@ -964,14 +966,14 @@ impl ComputeNode {
                     match &db {
                         DB::SystemDB => {}
                         DB::UserDB(db) => {
-                            url.set_path(db.name.as_str());
+                            conf.dbname(db.name.as_str());
                         }
                     }
 
-                    let url = Arc::new(url);
+                    let conf = Arc::new(conf);
                     let fut = Self::apply_spec_sql_db(
                         spec.clone(),
-                        url,
+                        conf,
                         ctx.clone(),
                         jwks_roles.clone(),
                         concurrency_token.clone(),
@@ -1017,7 +1019,7 @@ impl ComputeNode {
     /// semaphore. The caller has to make sure the semaphore isn't exhausted.
     async fn apply_spec_sql_db(
         spec: Arc<ComputeSpec>,
-        url: Arc<Url>,
+        conf: Arc<tokio_postgres::Config>,
         ctx: Arc<RwLock<MutableApplyContext>>,
         jwks_roles: Arc<HashSet<String>>,
         concurrency_token: Arc<tokio::sync::Semaphore>,
@@ -1046,7 +1048,7 @@ impl ComputeNode {
             // that database.
             || async {
                 if client_conn.is_none() {
-                    let db_client = Self::get_maintenance_client(&url).await?;
+                    let db_client = Self::get_maintenance_client(&conf).await?;
                     client_conn.replace(db_client);
                 }
                 let client = client_conn.as_ref().unwrap();
@@ -1061,34 +1063,16 @@ impl ComputeNode {
         Ok::<(), anyhow::Error>(())
     }
 
-    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
-    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        // If connection fails,
-        // it may be the old node with `zenith_admin` superuser.
-        //
-        // In this case we need to connect with old `zenith_admin` name
-        // and create new user. We cannot simply rename connected user,
-        // but we can create a new one and grant it all privileges.
-        let mut url = self.connstr.clone();
-        url.query_pairs_mut()
-            .append_pair("application_name", "apply_config");
-
-        let url = Arc::new(url);
-        let spec = Arc::new(
-            compute_state
-                .pspec
-                .as_ref()
-                .expect("spec must be set")
-                .spec
-                .clone(),
-        );
-
-        // Choose how many concurrent connections to use for applying the spec changes.
-        // If the cluster is not currently Running we don't have to deal with user connections,
+    /// Choose how many concurrent connections to use for applying the spec changes.
+    pub fn max_service_connections(
+        &self,
+        compute_state: &ComputeState,
+        spec: &ComputeSpec,
+    ) -> usize {
+        // If the cluster is in Init state we don't have to deal with user connections,
         // and can thus use all `max_connections` connection slots. However, that's generally not
         // very efficient, so we generally still limit it to a smaller number.
-        let max_concurrent_connections = if compute_state.status != ComputeStatus::Running {
+        if compute_state.status == ComputeStatus::Init {
             // If the settings contain 'max_connections', use that as template
             if let Some(config) = spec.cluster.settings.find("max_connections") {
                 config.parse::<usize>().ok()
@@ -1144,10 +1128,29 @@ impl ComputeNode {
                 .map(|val| if val > 1 { val - 1 } else { 1 })
                 .last()
                 .unwrap_or(3)
-        };
+        }
+    }
+
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip_all)]
+    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
+        let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+        conf.application_name("apply_config");
+
+        let conf = Arc::new(conf);
+        let spec = Arc::new(
+            compute_state
+                .pspec
+                .as_ref()
+                .expect("spec must be set")
+                .spec
+                .clone(),
+        );
+
+        let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
 
         // Merge-apply spec & changes to PostgreSQL state.
-        self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?;
+        self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
 
         if let Some(ref local_proxy) = &spec.clone().local_proxy_config {
             info!("configuring local_proxy");
@@ -1156,12 +1159,11 @@ impl ComputeNode {
 
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
-            let mut connstr = url.as_ref().clone();
-            connstr
-                .query_pairs_mut()
-                .append_pair("application_name", "migrations");
+            let conf = conf.as_ref().clone();
+            let mut conf = postgres::config::Config::from(conf);
+            conf.application_name("migrations");
 
-            let mut client = Client::connect(connstr.as_str(), NoTls)?;
+            let mut client = conf.connect(NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
         });
 
@@ -1222,21 +1224,28 @@ impl ComputeNode {
         let pgdata_path = Path::new(&self.pgdata);
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
         config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
-        // temporarily reset max_cluster_size in config
+
+        // TODO(ololobus): We need concurrency during reconfiguration as well,
+        // but the DB is already running and being used by the user, so we can
+        // easily exceed the `max_connections` limit, and the current code won't
+        // handle that.
+        // let compute_state = self.state.lock().unwrap().clone();
+        // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
+        let max_concurrent_connections = 1;
+
+        // Temporarily reset max_cluster_size in config
         // to avoid the possibility of hitting the limit, while we are reconfiguring:
-        // creating new extensions, roles, etc...
+        // creating new extensions, roles, etc.
         config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
             self.pg_reload_conf()?;
 
             if spec.mode == ComputeMode::Primary {
-                let mut url = self.connstr.clone();
-                url.query_pairs_mut()
-                    .append_pair("application_name", "apply_config");
-                let url = Arc::new(url);
+                let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+                conf.application_name("apply_config");
+                let conf = Arc::new(conf);
                 let spec = Arc::new(spec.clone());
-                self.apply_spec_sql(spec, url, 1)?;
+                self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
             }
 
             Ok(())
@@ -1362,7 +1371,17 @@ impl ComputeNode {
         let connstr = self.connstr.clone();
         thread::spawn(move || {
-            get_installed_extensions_sync(connstr).context("get_installed_extensions")
+            let res = get_installed_extensions(&connstr);
+            match res {
+                Ok(extensions) => {
+                    info!(
+                        "[NEON_EXT_STAT] {}",
+                        serde_json::to_string(&extensions)
+                            .expect("failed to serialize extensions list")
+                    );
+                }
+                Err(err) => error!("could not get installed extensions: {err:?}"),
+            }
         });
     }
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 8a047634df..a6c6cff20a 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -296,7 +296,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body>
                 Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
                 Err(e) => render_json_error(
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 79d8b2ca04..f473c29a55 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -2,17 +2,16 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
 use std::collections::HashSet;
-use tracing::info;
-use url::Url;
 
 use anyhow::Result;
 use postgres::{Client, NoTls};
-use tokio::task;
 
 use metrics::core::Collector;
 use metrics::{register_uint_gauge_vec, UIntGaugeVec};
 use once_cell::sync::Lazy;
+
+use crate::pg_helpers::postgres_conf_for_db;
 
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -42,75 +41,51 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 ///
 /// Same extension can be installed in multiple databases with different versions,
 /// we only keep the highest and lowest version across all databases.
-pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtensions> {
-    let mut connstr = connstr.clone();
+pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtensions> {
+    let mut client = Client::connect(connstr.as_str(), NoTls)?;
+    let databases: Vec<String> = list_dbs(&mut client)?;
 
-    task::spawn_blocking(move || {
-        let mut client = Client::connect(connstr.as_str(), NoTls)?;
-        let databases: Vec<String> = list_dbs(&mut client)?;
+    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    for db in databases.iter() {
+        let config = postgres_conf_for_db(connstr, db)?;
+        let mut db_client = config.connect(NoTls)?;
+        let extensions: Vec<(String, String)> = db_client
+            .query(
+                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                &[],
+            )?
+            .iter()
+            .map(|row| (row.get("extname"), row.get("extversion")))
+            .collect();
 
-        let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
-        for db in databases.iter() {
-            connstr.set_path(db);
-            let mut db_client = Client::connect(connstr.as_str(), NoTls)?;
-            let extensions: Vec<(String, String)> = db_client
-                .query(
-                    "SELECT extname, extversion FROM pg_catalog.pg_extension;",
-                    &[],
-                )?
-                .iter()
-                .map(|row| (row.get("extname"), row.get("extversion")))
-                .collect();
+        for (extname, v) in extensions.iter() {
+            let version = v.to_string();
 
-            for (extname, v) in extensions.iter() {
-                let version = v.to_string();
+            // increment the number of databases where the version of extension is installed
+            INSTALLED_EXTENSIONS
+                .with_label_values(&[extname, &version])
+                .inc();
 
-                // increment the number of databases where the version of extension is installed
-                INSTALLED_EXTENSIONS
-                    .with_label_values(&[extname, &version])
-                    .inc();
-
-                extensions_map
-                    .entry(extname.to_string())
-                    .and_modify(|e| {
-                        e.versions.insert(version.clone());
-                        // count the number of databases where the extension is installed
-                        e.n_databases += 1;
-                    })
-                    .or_insert(InstalledExtension {
-                        extname: extname.to_string(),
-                        versions: HashSet::from([version.clone()]),
-                        n_databases: 1,
-                    });
-            }
+            extensions_map
+                .entry(extname.to_string())
+                .and_modify(|e| {
+                    e.versions.insert(version.clone());
+                    // count the number of databases where the extension is installed
+                    e.n_databases += 1;
+                })
+                .or_insert(InstalledExtension {
+                    extname: extname.to_string(),
+                    versions: HashSet::from([version.clone()]),
+                    n_databases: 1,
+                });
         }
+    }
 
-        let res = InstalledExtensions {
-            extensions: extensions_map.values().cloned().collect(),
-        };
+    let res = InstalledExtensions {
+        extensions: extensions_map.values().cloned().collect(),
+    };
 
-        Ok(res)
-    })
-    .await?
-}
-
-// Gather info about installed extensions
-pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .expect("failed to create runtime");
-    let result = rt
-        .block_on(crate::installed_extensions::get_installed_extensions(
-            connstr,
-        ))
-        .expect("failed to get installed extensions");
-
-    info!(
-        "[NEON_EXT_STAT] {}",
-        serde_json::to_string(&result).expect("failed to serialize extensions list")
-    );
-    Ok(())
+    Ok(res)
 }
 
 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 4a1e5ee0e8..e03b410699 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -6,6 +6,7 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
+use std::str::FromStr;
 use std::thread::JoinHandle;
 use std::time::{Duration, Instant};
 
@@ -13,8 +14,10 @@ use anyhow::{bail, Result};
 use futures::StreamExt;
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
+use postgres::config::Config;
 use tokio::io::AsyncBufReadExt;
 use tokio::time::timeout;
+use tokio_postgres;
 use tokio_postgres::NoTls;
 use tracing::{debug, error, info, instrument};
 
@@ -542,3 +545,11 @@ async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Resu
 
     Ok(())
 }
+
+/// `postgres::config::Config` handles database names with whitespace
+/// and special characters properly.
+pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result<Config> {
+    let mut conf = Config::from_str(connstr.as_str())?;
+    conf.dbname(dbname);
+    Ok(conf)
+}
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index db3723b7cc..1cd9158c68 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import urllib.parse
+
 import requests
 from requests.adapters import HTTPAdapter
 
@@ -20,7 +22,9 @@ class EndpointHttpClient(requests.Session):
         return res.json()
 
     def database_schema(self, database: str):
-        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
+        res = self.get(
+            f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}"
+        )
         res.raise_for_status()
         return res.text
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a45a311dc2..1f4d2aa5ec 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3934,6 +3934,35 @@ class Endpoint(PgProtocol, LogUtils):
             log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
+    def respec_deep(self, **kwargs: Any) -> None:
+        """
+        Update the endpoint's spec file, taking nested keys into account.
+        It does a one-level-deep update, which should be enough for most cases.
+        This is kept separate from respec() so as not to break existing functionality.
+
+        NOTE: Unlike respec(), this method updates the spec.json file, not endpoint.json,
+        because neon_local also writes to spec.json. The intended use-case is:
+        i) start the endpoint with some config, ii) respec_deep(),
+        iii) call reconfigure() to apply the changes.
+        """
+        config_path = os.path.join(self.endpoint_path(), "spec.json")
+        with open(config_path) as f:
+            data_dict: dict[str, Any] = json.load(f)
+
+        log.info("Current compute spec: %s", json.dumps(data_dict, indent=4))
+
+        for key, value in kwargs.items():
+            if isinstance(value, dict):
+                if key not in data_dict:
+                    data_dict[key] = value
+                else:
+                    data_dict[key] = {**data_dict[key], **value}
+            else:
+                data_dict[key] = value
+
+        with open(config_path, "w") as file:
+            log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
+            json.dump(data_dict, file, indent=4)
+
     # Please note: Migrations only run if pg_skip_catalog_updates is false
     def wait_for_migrations(self, num_migrations: int = 11):
         with self.cursor() as cur:
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index d43c71ceac..b3719a45ed 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -3,13 +3,60 @@ from __future__ import annotations
 import requests
 from fixtures.neon_fixtures import NeonEnv
 
+TEST_DB_NAMES = [
+    {
+        "name": "neondb",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with spaces",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with%20spaces ",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with whitespaces ",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "injective db with spaces'; SELECT pg_sleep(10);",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with #pound-sign and &ersands=true",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with emoji 🌍",
+        "owner": "cloud_admin",
+    },
+]
+
 
 def test_compute_catalog(neon_simple_env: NeonEnv):
+    """
+    Create a bunch of databases with tricky names and test that we can list them
+    and dump them via the API.
+ """ env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) - client = endpoint.http_client() + endpoint = env.endpoints.create_start("main") + # Update the spec.json file to include new databases + # and reconfigure the endpoint to create some test databases. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + client = endpoint.http_client() objects = client.dbs_and_roles() # Assert that 'cloud_admin' role exists in the 'roles' list @@ -22,9 +69,24 @@ def test_compute_catalog(neon_simple_env: NeonEnv): db["name"] == "postgres" for db in objects["databases"] ), "The 'postgres' database is missing" - ddl = client.database_schema(database="postgres") + # Check other databases + for test_db in TEST_DB_NAMES: + db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None) + assert db is not None, f"The '{test_db['name']}' database is missing" + assert ( + db["owner"] == test_db["owner"] + ), f"The '{test_db['name']}' database has incorrect owner" - assert "-- PostgreSQL database dump" in ddl + ddl = client.database_schema(database=test_db["name"]) + + # Check that it looks like a valid PostgreSQL dump + assert "-- PostgreSQL database dump" in ddl + + # Check that it doesn't contain health_check and migration traces. + # They are only created in system `postgres` database, so by checking + # that we ensure that we dump right databases. + assert "health_check" not in ddl, f"The '{test_db['name']}' database contains health_check" + assert "migration" not in ddl, f"The '{test_db['name']}' database contains migrations data" try: client.database_schema(database="nonexistentdb") @@ -33,3 +95,44 @@ def test_compute_catalog(neon_simple_env: NeonEnv): assert ( e.response.status_code == 404 ), f"Expected 404 status code, but got {e.response.status_code}" + + +def test_compute_create_databases(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can create and work with databases with special + characters (whitespaces, %, tabs, etc.) in the name. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start("main") + + # Update the spec.json file to include new databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + # Check that database has a correct name in the system catalog + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is not None + assert len(catalog_db) == 1 + assert catalog_db[0] == db["name"] + + # Check that we can connect to this database without any issues + with endpoint.cursor(dbname=db["name"]) as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == db["name"]