Compare commits


29 Commits

Author SHA1 Message Date
BodoBolero
d0d29468d8 one more segfault in handling error messages 2025-02-06 17:43:00 +01:00
BodoBolero
567a665dc4 tried patch locally 2025-02-06 13:52:02 +01:00
BodoBolero
b0007302d0 include patches in repo 2025-02-06 13:37:00 +01:00
BodoBolero
b35dd198c3 fix pgcopydb seg fault and -c idle_in_transaction_session_timeout=0 2025-02-06 11:27:29 +01:00
Arpad Müller
b66fbd6176 Warn on basebackups for archived timelines (#10688)
We don't want any external requests for an archived timeline. This
includes basebackup requests, i.e. when a compute is being started up.

Therefore, we'd like to forbid such basebackup requests: any attempt to
get a basebackup on an archived timeline (or any getpage request really)
is a cplane bug. Make this a warning for now so that, if there is
potentially a bug, we can detect cases in the wild before they cause
stuck operations, but the intention is to return an error eventually.
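A minimal sketch of the behavior described above, with simplified types; the actual change is in the pageserver's basebackup handler and appears in the diff further down:

```rust
// Hedged sketch: `is_archived` mirrors a Timeline::is_archived()-style query,
// which may be unknown (None).
fn check_basebackup_allowed(is_archived: Option<bool>, tenant_id: &str, timeline_id: &str) {
    if is_archived == Some(true) {
        // TODO: after a grace period, turn this log line into a hard error.
        tracing::warn!(
            "timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it."
        );
        // return Err(QueryError::NotFound("timeline is archived".into()))
    }
}
```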

Related: #9548
2025-02-06 10:09:20 +00:00
Vlad Lazar
95588dab98 safekeeper: fix wal fan-out shard subscription data race (#10677)
## Problem

[This select
arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L414)
runs when we want to attach a new reader to the current cursor.
It checks the current position of the cursor and resets it if required.

The current position of the cursor is updated in the [other select
arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L336-L345).
That runs when we get some WAL to send.

Now, what happens if we want to attach two shards consecutively to the
cursor?
Let's say [this select
arm](https://github.com/neondatabase/neon/blob/main/safekeeper/src/send_interpreted_wal.rs#L397)
runs twice in a row.

Let's assume the cursor is currently at LSN X. The first shard wants to attach
at position V and the other one at W. Assume X > W > V.

The first shard resets the stream to position V. The second shard comes in,
sees the stale cursor position X and resets it to W. This means that the
first shard doesn't get WAL in the [V, W) range.

## Summary of changes

Ultimately, this boils down to the current position not being kept in
sync with the reset of the WAL stream. This patch fixes the race by
updating it when resetting the WAL stream and adds a unit test repro.
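A minimal sketch of the invariant the fix restores, with hypothetical types standing in for the real cursor in safekeeper/src/send_interpreted_wal.rs:

```rust
struct SharedCursor {
    /// Cached position the underlying WAL stream will read from next.
    current_pos: u64,
}

impl SharedCursor {
    /// Attach a new shard reader that needs WAL starting at `start_pos`.
    fn attach_reader(&mut self, start_pos: u64) {
        if start_pos < self.current_pos {
            self.reset_stream_to(start_pos);
        }
    }

    fn reset_stream_to(&mut self, pos: u64) {
        // ... reset the underlying WAL reader to `pos` ...
        // The fix: update the cached position in the same step, so a second
        // attach cannot observe a stale position and skip a [V, W) range.
        self.current_pos = pos;
    }
}
```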

Closes https://github.com/neondatabase/cloud/issues/23750
2025-02-06 09:24:28 +00:00
Christian Schwarz
1686d9e733 perf(page_service): dont .instrument(span.clone()) the response flush (#10686)
On my AX102 Hetzner box, removing this line removes about 20us from the
`latency_mean` result in

`test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant`.

If the same 20us can be removed in the nightly benchmark run, this will
be a ~10% improvement because there, mean latencies are about ~220us.

This span was added during the batching refactors; we didn't have it before,
and I don't think it's terribly useful.

refs
- https://github.com/neondatabase/cloud/issues/21759
2025-02-06 08:33:37 +00:00
Erik Grinaker
abcd00181c pageserver: set a concurrency limit for LocalFS (#10676)
## Problem

The local filesystem backend for remote storage doesn't set a
concurrency limit. While the backend can't/won't enforce a concurrency limit
itself, the configured limit also bounds the upload queue concurrency. Some tests
create thousands of uploads, which slows down the quadratic scheduling of the
upload queue, and there is no point spawning that many Tokio tasks.

Resolves #10409.

## Summary of changes

Set a concurrency limit of 100 for the LocalFS backend.

Before: `test_layer_map[release-pg17].test_query: 68.338 s`
After: `test_layer_map[release-pg17].test_query: 5.209 s`
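For illustration, a hedged sketch of how a fixed concurrency limit bounds the number of in-flight uploads, using a tokio semaphore; this is not the remote_storage implementation itself:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn run_uploads(uploads: Vec<Vec<u8>>, concurrency_limit: usize) {
    let semaphore = Arc::new(Semaphore::new(concurrency_limit));
    let mut handles = Vec::new();
    for payload in uploads {
        // Wait for a permit before spawning, so at most `concurrency_limit`
        // upload tasks exist at any time instead of thousands.
        let permit = semaphore.clone().acquire_owned().await.unwrap();
        handles.push(tokio::spawn(async move {
            let _permit = permit; // held for the duration of the upload
            upload_one(payload).await;
        }));
    }
    for handle in handles {
        let _ = handle.await;
    }
}

async fn upload_one(_payload: Vec<u8>) {
    // ... write the object to the local filesystem backend ...
}
```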
2025-02-06 07:24:36 +00:00
Konstantin Knizhnik
01f0be03b5 Fix bugs in lfc_cache_containsv (#10682)
## Problem

Incorrect manipulation of the iteration index in `lfc_cache_containsv`.

## Summary of changes

```
-		int		this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
+		int		this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
-		if (i + 1 >= nblocks)
+		if (i >= nblocks)
```
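To see why the fix matters, here is a hedged Rust rendering of the corrected chunked iteration; the real code is C in `lfc_cache_containsv`, and the constant and function names here are illustrative:

```rust
const BLOCKS_PER_CHUNK: u32 = 128; // illustrative value; must be a power of two

/// Count how many chunks a request of `nblocks` blocks starting at `blkno` touches.
fn count_chunks(blkno: u32, nblocks: u32) -> u32 {
    let mut i = 0;
    let mut chunks = 0;
    let mut chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
    while i < nblocks {
        // Remaining blocks in the request vs. remaining blocks in the current chunk.
        // Using `nblocks` instead of `nblocks - i` here (the old bug) overshoots.
        let this_chunk = (nblocks - i).min(BLOCKS_PER_CHUNK - chunk_offs);
        chunks += 1;
        i += this_chunk;
        chunk_offs = 0; // subsequent chunks start at offset 0
    }
    chunks
}
```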

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-02-06 07:00:00 +00:00
Konstantin Knizhnik
81cd30e4d6 Use #ifdef instead of #if USE_ASSERT_CHECKING (#10683)
## Problem

`USE_ASSERT_CHECKING` is defined as an empty macro, but it is checked using `#if`.

## Summary of changes

Replace `#if USE_ASSERT_CHECKING` with `#ifdef USE_ASSERT_CHECKING`, as is
done in other places in Postgres.

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-02-06 05:47:56 +00:00
Konstantin Knizhnik
7fc6953da4 Is neon superuser (#10625)
## Problem

The is_neon_superuser() function is public in pg14/pg15
but statically defined in publicationcmd.c in pg16/pg17.

## Summary of changes

Make this function public for all Postgres versions.
It is intended to be used outside of publicationcmd.c as well.

See
https://github.com/neondatabase/postgres/pull/573
https://github.com/neondatabase/postgres/pull/576

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-02-06 05:42:14 +00:00
Christian Schwarz
77f9e74d86 pgxn: include socket send & recv queue size in slow response logs (#10673)
# Problem

When we see an apparent slow request, one possible cause is that the
client is failing to consume responses, but we don't have a clear way to
see that.

# Solution

- Log the socket queue depths on slow/stuck connections, so that we have
an indication of whether the compute is keeping up with processing the
connection's responses.

refs
- slack https://neondb.slack.com/archives/C036U0GRMRB/p1738652644396329
- refs https://github.com/neondatabase/cloud/issues/23515
- refs https://github.com/neondatabase/cloud/issues/23486
2025-02-06 01:14:29 +00:00
Alex Chi Z.
0ceeec9be3 fix(pageserver): schedule compaction immediately if pending (#10684)
## Problem

The code is intended to reschedule compaction immediately if there are
pending tasks. Previously we set the duration to 0 when there were pending
tasks, but that goes through the `if period == Duration::ZERO {` branch and
sleeps for another 10 seconds.

## Summary of changes

Set the duration to 1 second so that it doesn't sleep for too long.
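A small sketch of the scheduling decision, under the stated assumption that a zero period is treated as "compaction disabled" elsewhere in the loop (types simplified):

```rust
use std::time::Duration;

enum CompactionOutcome {
    Pending,
    Done,
}

fn next_sleep(outcome: &CompactionOutcome, period: Duration) -> Duration {
    match outcome {
        // Re-run almost immediately. Duration::ZERO would fall into the
        // `if period == Duration::ZERO` branch and sleep another 10 seconds.
        CompactionOutcome::Pending => Duration::from_secs(1),
        CompactionOutcome::Done => period,
    }
}
```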

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-02-05 22:11:50 +00:00
Alex Chi Z.
733a57247b fix(pageserver): disallow gc-compaction produce l0 layer (#10679)
## Problem

Compaction should never produce L0 layers. This never happened in my
experiments, but it is good to guard against it early.

## Summary of changes

Disallow gc-compaction from producing L0 layers.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-02-05 20:44:28 +00:00
Heikki Linnakangas
6699a30a49 Make it easy to build only a subset of extensions into compute image (#10655)
The full build of all extensions takes a long time. When working locally
on parts that don't need extensions, you can iterate more quickly by
skipping the unnecessary extensions.

This adds a build argument to the Dockerfile to specify which extensions to
build. There are three options:

- EXTENSIONS=all (default)
- EXTENSIONS=minimal: Build only a few extensions that are listed in
shared_preload_libraries in the default neon config.
- EXTENSIONS=none: Build no extensions (except for the mandatory 'neon'
extension).
2025-02-05 18:07:51 +00:00
Alex Chi Z.
133b89a83d feat(pageserver): continue from last incomplete image layer creation (#10660)
## Problem

close https://github.com/neondatabase/neon/issues/10651

## Summary of changes

* Image layer creation starts from the next partition of the last
processed partition if the previous attempt was not complete.
* Add tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-02-05 17:35:39 +00:00
Arseny Sher
fba22a7123 Record more timings in test_layer_map (#10670)
## Problem

It is not very clear how much time the different operations take.

## Summary of changes

Record more timings.

ref https://github.com/neondatabase/neon/issues/10409
2025-02-05 17:00:26 +00:00
John Spray
14e05276a3 storcon: fix a case where optimise could get stuck on unschedulable node (#10648)
## Problem

When a shard has two secondary locations, but one of them is on a node
with MaySchedule::No, the optimiser would get stuck, because it couldn't
decide which secondary to remove.

This is generally okay if a node is offline, but if a node is in Pause
mode for a long period of time, it's a problem.

Closes: https://github.com/neondatabase/neon/issues/10646

## Summary of changes

- Instead of insisting on finding a node in the wrong AZ to remove, find
an available node in the _right_ AZ, and remove all the others. This
ensures that if there is one live suitable node, then other
offline/paused nodes cannot hold things up.
2025-02-05 16:05:12 +00:00
Tristan Partin
ebc55e6ae8 Fix logic for checking if a compute can install a remote extension (#10656)
Given a remote extensions manifest of the following:

```json
  {
    "public_extensions": [],
    "custom_extensions": null,
    "library_index": {
      "pg_search": "pg_search"
    },
    "extension_data": {
      "pg_search": {
        "control_data": {
          "pg_search.control": "comment = 'pg_search: Full text search for PostgreSQL using BM25'\ndefault_version = '0.14.1'\nmodule_pathname = '$libdir/pg_search'\nrelocatable = false\nsuperuser = true\nschema = paradedb\ntrusted = true\n"
        },
        "archive_path": "13117844657/v14/extensions/pg_search.tar.zst"
      }
    }
  }
```

We were allowing a compute to install a remote extension that wasn't
listed in either public_extensions or custom_extensions.
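A simplified sketch of the corrected check, with field names abbreviated from compute_tools (see the hunk below): an extension is allowed only if it appears in public_extensions or custom_extensions, and a missing list allows nothing by itself.

```rust
fn is_ext_allowed(
    public_extensions: &Option<Vec<String>>,
    custom_extensions: &Option<Vec<String>>,
    ext_name: &str,
) -> bool {
    let in_list = |list: &Option<Vec<String>>| {
        list.as_ref()
            .is_some_and(|exts| exts.iter().any(|e| e == ext_name))
    };
    // Previously a missing/empty public list could fall through and allow the
    // extension even when neither list contained it.
    in_list(public_extensions) || in_list(custom_extensions)
}
```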

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-02-05 14:58:33 +00:00
Erik Grinaker
f07119cca7 pageserver: add pageserver_wal_ingest_values_committed metric (#10653)
## Problem

We don't have visibility into the ratio of image vs. delta pages
ingested in Pageservers. This might be useful to determine whether we
should compress WAL records before storing them, which in turn might
make compaction more efficient.

## Summary of changes

Add `pageserver_wal_ingest_values_committed` metric with dimensions
`class=metadata|data` and `kind=image|delta`.
2025-02-05 14:33:04 +00:00
Vlad Lazar
47975d06d9 storcon: silence cplane 404s on tenant creation (#10665)
## Problem

We get WARN log noise on tenant creations. Cplane creates tenants via
/location_config. That returns the attached locations in the response
and spawns a reconciliation which will also attempt to notify cplane. If
the notification is attempted before cplane persists the shards to its
database, storcon gets back a 404. The situation is harmless, but
annoying.

## Summary of Changes

* Add a tenant creation hint to the reconciler config
* If the hint is true and we get back a 404 on the notification from
cplane, ignore the error, but still queue the reconcile up for a retry.

Closes https://github.com/neondatabase/cloud/issues/20732
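A hedged sketch of the described handling, with hypothetical types; the actual reconciler plumbing differs:

```rust
struct ReconcilerConfig {
    /// Hint that this reconcile was triggered by a tenant creation.
    tenant_creation_hint: bool,
}

enum NotifyOutcome {
    Ok,
    NotFound,
    Other(String),
}

/// Returns Ok(retry), where `retry` says whether to queue the reconcile again.
fn handle_notify_result(cfg: &ReconcilerConfig, outcome: NotifyOutcome) -> Result<bool, String> {
    match outcome {
        NotifyOutcome::Ok => Ok(false),
        NotifyOutcome::NotFound if cfg.tenant_creation_hint => {
            // Cplane may not have persisted the shards yet: ignore the 404,
            // but still retry the notification later.
            Ok(true)
        }
        NotifyOutcome::NotFound => Err("unexpected 404 from cplane".to_string()),
        NotifyOutcome::Other(err) => Err(err),
    }
}
```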
2025-02-05 12:41:09 +00:00
Fedor Dikarev
472007dd7c ci: unify Dockerfiles, set bash as SHELL for debian layers, make cpan step as separate RUN (#10645)
## Problem
Ref: https://github.com/neondatabase/cloud/issues/23461

and follow-up after: https://github.com/neondatabase/neon/pull/10553

We used `echo` to set up `.wgetrc` and `.curlrc`, relying on `\n` to produce
these multiline configs with a single echo command.

The problem is that Debian `/bin/sh`'s built-in echo command behaves
differently from the `/bin/echo` executable and from the `echo` built-in
in `bash`. Namely, it does not support the `-e` option, and while it does
treat `\n` as a newline, passing `-e` adds a literal `-e` to the output.
At the same time, when we use other base images, for example
`alpine/curl`, their `/bin/sh` supports and requires `-e` for treating
escape sequences like `\n`.
But having different `echo` implementations and remembering the differences
in their behaviour is a poor developer experience and makes the Dockerfiles
harder to maintain.

Work-arounds:

- Explicitly use `/bin/bash` (like in this PR)
- Use `/bin/echo` instead of the shell's built-in echo function
- Use `printf "foo\n"` instead of `echo -e "foo\n"`

## Summary of changes
1. To fix that, set `/bin/bash` as the SHELL for the Debian-based layers.
2. No changes for the `alpine/curl`-based layers.
3. One more change: the `extensions` layer is split into two steps,
installing dependencies from CPAN and installing `lcov` from GitHub,
so that upgrading `lcov` can reuse the previous layer with the installed
CPAN modules.
2025-02-04 18:58:02 +00:00
Vlad Lazar
f9009d6b80 pageserver: write heatmap to disk after uploading it (#10650)
## Problem

We wish to make heatmap generation additive in
https://github.com/neondatabase/neon/pull/10597.
However, if the pageserver restarts and has a heatmap on disk from when
it was a secondary long ago,
we can end up keeping extra layers on the secondary's disk.

## Summary of changes

Persist the heatmap after a successful upload.
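For context, a plain-std approximation of the crash-safe "write temp file, then rename" pattern that the pageserver's VirtualFile::crashsafe_overwrite provides; the real code uses a dedicated temp suffix and its own IO layer:

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

fn persist_atomically(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let tmp = path.with_extension("___temp"); // illustrative temp name
    {
        let mut file = fs::File::create(&tmp)?;
        file.write_all(bytes)?;
        file.sync_all()?; // ensure the data is durable before the rename
    }
    // Atomic on POSIX filesystems: readers see either the old or the new heatmap.
    fs::rename(&tmp, path)?;
    Ok(())
}
```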
2025-02-04 17:52:54 +00:00
Alex Chi Z.
cab60b6d9f fix(pagesever): stablize gc-compaction tests (#10621)
## Problem

Hopefully this can resolve
https://github.com/neondatabase/neon/issues/10517. The reason the test is
flaky is that after a restart the compute node might write some data, causing
the pageserver to flush some layers, which in turn triggers L0 compaction, so
we cannot set up the test scenario we want.

## Summary of changes

Ensure all L0 layers are compacted before starting the test.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-02-04 16:11:31 +00:00
Erik Grinaker
06090bbccd pageserver: log critical error on ClearVmBits for unknown pages (#10634)
## Problem

In #9895, we fixed some issues where `ClearVmBits` were broadcast to all
shards, even those not owning the VM relation. As part of that, we found
some ancient code from #1417, which discarded spurious incorrect
`ClearVmBits` records for pages outside of the VM relation. We added
observability in #9911 to see how often this actually happens in the
wild.

After two months, we have not seen this happen once in production or
staging. However, out of caution, we don't want a hard error that would break
WAL ingestion.

Resolves #10067.

## Summary of changes

Log a critical error when ingesting `ClearVmBits` for unknown VM
relations or pages.
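A minimal sketch of the "log loudly but keep ingesting" behavior; the pageserver uses its own critical! macro, approximated here with tracing::error!:

```rust
fn ingest_clear_vm_bits(vm_size: Option<u32>, vm_rel: &str) {
    let Some(_vm_size) = vm_size else {
        // Unknown VM relation: record a critical error but do not fail WAL ingestion.
        tracing::error!("clear_vm_bits for unknown VM relation {vm_rel}");
        return;
    };
    // ... apply the ClearVmBits record against the known relation ...
}
```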
2025-02-04 14:55:11 +00:00
Folke Behrens
dcf335a251 proxy: Switch proxy to JSON logging (#9857)
## Problem

We want to switch proxy and ideally all Rust services to structured JSON
logging to support better filtering and cross-referencing with tracing.

## Summary of changes

* Introduce a custom tracing-subscriber to write the JSON. In a first
attempt a customized tracing::fmt::FmtSubscriber was used, but it's very
inefficient and can still generate invalid JSON. It also doesn't allow
us to add important fields to the root object.
* Make this opt-in: the `LOGFMT` env var can be set to `"json"` to
enable the new logger at startup.
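A hedged sketch of the opt-in shape only; the proxy ships a custom subscriber rather than the stock fmt JSON layer (assumes tracing-subscriber built with the `json` feature):

```rust
fn init_logging() {
    let use_json = std::env::var("LOGFMT").as_deref() == Ok("json");
    if use_json {
        tracing_subscriber::fmt().json().init();
    } else {
        tracing_subscriber::fmt().init();
    }
}
```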
2025-02-04 14:50:53 +00:00
Arpad Müller
b6e9daea9a storcon: only allow errrors of the server cert verification (#10644)
This PR does a bunch of things:

* Only allow errors from the server cert verification, not from the TLS
handshake. The TLS handshake doesn't cause any errors for us, so we can
just always require it to be valid. This simplifies the code a little.
* As the solution is more permanent than originally anticipated, I think
it makes sense to move the `AcceptAll` verifier outside.
* Log the connstr information. This helps with figuring out which domain
names are configured in the connstr, etc. I think it is generally useful
to print it. Make extra sure that the password is not leaked.

Follow-up of #10640
2025-02-04 14:01:57 +00:00
a-masterov
d5c3a4e2b9 Add support for pgjwt test (#10611)
## Problem
We don't currently test pgjwt, even though it is based on pg_prove and can
easily be added.
## Summary of changes
A test for pgjwt was added.
2025-02-04 13:49:44 +00:00
Heikki Linnakangas
8107140f7f Refactor compute dockerfile (#10371)
Refactor how extensions are built in compute Dockerfile
    
1. Rename some of the extension layers, so that names correspond more
   precisely to the upstream repository name and the source directory
   name. For example, instead of "pg-jsonschema-pg-build", spell it
   "pg_jsonschema-build". Some of the layer names had the extra "pg-"
   part, and some didn't; harmonize on not having it. And use an
   underscore if the upstream project name uses an underscore.

2. Each extension now consists of two dockerfile targets:
   [extension]-src and [extension]-build. By convention, the -src
   target downloads the sources and applies any neon-specific patches
   if necessary. The source tarball is downloaded and extracted under
   /ext-src. For example, the 'pgvector' extension creates the
   following files and directory:

        /ext-src/pgvector.tar.gz  # original tarball
        /ext-src/pgvector.patch   # neon-specific patch, copied from patches/ dir
        /ext-src/pgvector-src/    # extracted tarball, with patch applied

    This separation avoids re-downloading the sources every time the
    extension is recompiled. The 'extension-tests' target also uses the
    [extension]-src layers, by copying the /ext-src/ dirs from all
    the extensions together into one image.

This refactoring came about when I was experimenting with different
ways of splitting up the Dockerfile so that each extension would be in
a separate file. That's not part of this PR yet, but this is a good
step in modularizing the extensions.
2025-02-04 10:35:43 +00:00
65 changed files with 3043 additions and 1941 deletions

View File

@@ -24,3 +24,4 @@
!storage_controller/
!vendor/postgres-*/
!workspace_hack/
!build_tools/patches

View File

@@ -682,7 +682,7 @@ jobs:
push: true
pull: true
file: compute/compute-node.Dockerfile
target: neon-pg-ext-test
target: extension-tests
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

Cargo.lock generated
View File

@@ -206,6 +206,16 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "assert-json-diff"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "async-channel"
version = "1.9.0"
@@ -1010,6 +1020,12 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxcar"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
[[package]]
name = "bstr"
version = "1.5.0"
@@ -2433,6 +2449,16 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "gettid"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "397256552fed4a9e577850498071831ec8f18ea83368aecc114cab469dcb43e5"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "gimli"
version = "0.31.1"
@@ -4212,6 +4238,16 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "papaya"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
dependencies = [
"equivalent",
"seize",
]
[[package]]
name = "parking"
version = "2.1.1"
@@ -4839,6 +4875,7 @@ dependencies = [
"ahash",
"anyhow",
"arc-swap",
"assert-json-diff",
"async-compression",
"async-trait",
"atomic-take",
@@ -4846,6 +4883,7 @@ dependencies = [
"aws-sdk-iam",
"aws-sigv4",
"base64 0.13.1",
"boxcar",
"bstr",
"bytes",
"camino",
@@ -4862,6 +4900,7 @@ dependencies = [
"flate2",
"framed-websockets",
"futures",
"gettid",
"hashbrown 0.14.5",
"hashlink",
"hex",
@@ -4884,7 +4923,9 @@ dependencies = [
"measured",
"metrics",
"once_cell",
"opentelemetry",
"p256 0.13.2",
"papaya",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
@@ -4931,6 +4972,9 @@ dependencies = [
"tokio-tungstenite 0.21.0",
"tokio-util",
"tracing",
"tracing-log",
"tracing-opentelemetry",
"tracing-serde",
"tracing-subscriber",
"tracing-utils",
"try-lock",
@@ -5884,6 +5928,16 @@ dependencies = [
"libc",
]
[[package]]
name = "seize"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "semver"
version = "1.0.17"
@@ -8145,6 +8199,7 @@ dependencies = [
"tower 0.4.13",
"tracing",
"tracing-core",
"tracing-log",
"url",
"zerocopy",
"zeroize",

View File

@@ -54,6 +54,7 @@ async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
backtrace = "0.3.74"
flate2 = "1.0.26"
assert-json-diff = "2"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
@@ -193,7 +194,9 @@ tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-log = "0.2"
tracing-opentelemetry = "0.28"
tracing-serde = "0.2.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }

View File

@@ -3,10 +3,17 @@ ARG DEBIAN_VERSION=bookworm
FROM debian:bookworm-slim AS pgcopydb_builder
ARG DEBIAN_VERSION
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
# By default, /bin/sh used in debian images will treat '\n' as eol,
# but as we use bash as SHELL, and built-in echo in bash requires '-e' flag for that.
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
set -e && \
apt update && \
@@ -39,6 +46,7 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
mkdir /tmp/pgcopydb && \
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
cd /tmp/pgcopydb && \
patch -p1 < /pgcopydbv017.patch && \
make -s clean && \
make -s -j12 install && \
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
@@ -55,7 +63,8 @@ ARG DEBIAN_VERSION
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
# Use strict mode for bash to catch errors early
SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
RUN mkdir -p /pgcopydb/bin && \
mkdir -p /pgcopydb/lib && \
@@ -66,7 +75,7 @@ COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/p
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc \
echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
# System deps
@@ -190,8 +199,14 @@ RUN set -e \
# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
# And patches from us:
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
RUN set +o pipefail && \
for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do \
yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')";\
done && \
set -o pipefail
# Split into separate step to debug flaky failures here
RUN wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
&& ls -laht lcov.tar.gz && sha256sum lcov.tar.gz \
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
&& cd lcov \

View File

@@ -0,0 +1,57 @@
diff --git a/src/bin/pgcopydb/copydb.c b/src/bin/pgcopydb/copydb.c
index d730b03..69a9be9 100644
--- a/src/bin/pgcopydb/copydb.c
+++ b/src/bin/pgcopydb/copydb.c
@@ -44,6 +44,7 @@ GUC dstSettings[] = {
{ "synchronous_commit", "'off'" },
{ "statement_timeout", "0" },
{ "lock_timeout", "0" },
+ { "idle_in_transaction_session_timeout", "0" },
{ NULL, NULL },
};
diff --git a/src/bin/pgcopydb/pgsql.c b/src/bin/pgcopydb/pgsql.c
index 94f2f46..e051ba8 100644
--- a/src/bin/pgcopydb/pgsql.c
+++ b/src/bin/pgcopydb/pgsql.c
@@ -2319,6 +2319,11 @@ pgsql_execute_log_error(PGSQL *pgsql,
LinesBuffer lbuf = { 0 };
+ if (message != NULL){
+ // make sure message is writable by splitLines
+ message = strdup(message);
+ }
+
if (!splitLines(&lbuf, message))
{
/* errors have already been logged */
@@ -2332,6 +2337,7 @@ pgsql_execute_log_error(PGSQL *pgsql,
PQbackendPID(pgsql->connection),
lbuf.lines[lineNumber]);
}
+ free(message); // free copy of message we created above
if (pgsql->logSQL)
{
@@ -3174,11 +3180,18 @@ pgcopy_log_error(PGSQL *pgsql, PGresult *res, const char *context)
/* errors have already been logged */
return;
}
-
if (res != NULL)
{
char *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
- strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ if (sqlstate == NULL)
+ {
+ // PQresultErrorField returned NULL!
+ pgsql->sqlstate[0] = '\0'; // Set to an empty string to avoid segfault
+ }
+ else
+ {
+ strlcpy(pgsql->sqlstate, sqlstate, sizeof(pgsql->sqlstate));
+ }
}
char *endpoint =

File diff suppressed because it is too large

View File

@@ -32,6 +32,7 @@ reason = "the marvin attack only affects private key decryption, not public key
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
[licenses]
allow = [
"0BSD",
"Apache-2.0",
"BSD-2-Clause",
"BSD-3-Clause",

View File

@@ -52,6 +52,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
if [ $pg_version -ge 16 ]; then
docker cp ext-src $TEST_CONTAINER_NAME:/
docker exec $TEST_CONTAINER_NAME bash -c "apt update && apt install -y libtap-parser-sourcehandler-pgtap-perl"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config

View File

@@ -0,0 +1,4 @@
#!/bin/bash
set -ex
cd "$(dirname "${0}")"
pg_prove test.sql

View File

@@ -0,0 +1,15 @@
diff --git a/test.sql b/test.sql
index d7a0ca8..f15bc76 100644
--- a/test.sql
+++ b/test.sql
@@ -9,9 +9,7 @@
\set ON_ERROR_STOP true
\set QUIET 1
-CREATE EXTENSION pgcrypto;
-CREATE EXTENSION pgtap;
-CREATE EXTENSION pgjwt;
+CREATE EXTENSION IF NOT EXISTS pgtap;
BEGIN;
SELECT plan(23);

View File

@@ -0,0 +1,5 @@
#!/bin/sh
set -ex
cd "$(dirname ${0})"
patch -p1 <test-upgrade.patch
pg_prove test.sql

View File

@@ -24,7 +24,7 @@ function wait_for_ready {
}
function create_extensions() {
for ext in ${1}; do
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext}"
docker compose exec neon-test-extensions psql -X -v ON_ERROR_STOP=1 -d contrib_regression -c "CREATE EXTENSION IF NOT EXISTS ${ext} CASCADE"
done
}
EXTENSIONS='[
@@ -40,7 +40,8 @@ EXTENSIONS='[
{"extname": "pg_uuidv7", "extdir": "pg_uuidv7-src"},
{"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
{"extname": "semver", "extdir": "pg_semver-src"},
{"extname": "pg_ivm", "extdir": "pg_ivm-src"}
{"extname": "pg_ivm", "extdir": "pg_ivm-src"},
{"extname": "pgjwt", "extdir": "pgjwt-src"}
]'
EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d

View File

@@ -204,14 +204,16 @@ impl RemoteExtSpec {
// Check if extension is present in public or custom.
// If not, then it is not allowed to be used by this compute.
if let Some(public_extensions) = &self.public_extensions {
if !public_extensions.contains(&real_ext_name.to_string()) {
if let Some(custom_extensions) = &self.custom_extensions {
if !custom_extensions.contains(&real_ext_name.to_string()) {
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
}
}
}
if !self
.public_extensions
.as_ref()
.is_some_and(|exts| exts.iter().any(|e| e == ext_name))
&& !self
.custom_extensions
.as_ref()
.is_some_and(|exts| exts.iter().any(|e| e == ext_name))
{
return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
}
match self.extension_data.get(real_ext_name) {
@@ -340,6 +342,96 @@ mod tests {
use super::*;
use std::fs::File;
#[test]
fn allow_installing_remote_extensions() {
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": null,
"custom_extensions": null,
"library_index": {},
"extension_data": {},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": null,
"library_index": {},
"extension_data": {},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": [],
"library_index": {
"ext": "ext"
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect_err("Extension should not be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": [],
"custom_extensions": ["ext"],
"library_index": {
"ext": "ext"
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": ["ext"],
"custom_extensions": [],
"library_index": {
"ext": "ext"
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
}
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
use crate::{
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
/// External backup storage configuration, enough for creating a client for that storage.
@@ -45,11 +45,11 @@ impl RemoteStorageKind {
impl RemoteStorageConfig {
/// Helper to fetch the configured concurrency limit.
pub fn concurrency_limit(&self) -> Option<usize> {
pub fn concurrency_limit(&self) -> usize {
match &self.storage {
RemoteStorageKind::LocalFs { .. } => None,
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
RemoteStorageKind::LocalFs { .. } => DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT,
RemoteStorageKind::AwsS3(c) => c.concurrency_limit.into(),
RemoteStorageKind::AzureContainer(c) => c.concurrency_limit.into(),
}
}
}

View File

@@ -65,6 +65,12 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
/// Here, a limit of max 20k concurrent connections was noted.
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
/// Set this limit analogously to the S3 limit.
///
/// The local filesystem backend doesn't enforce a concurrency limit itself, but this also bounds
/// the upload queue concurrency. Some tests create thousands of uploads, which slows down the
/// quadratic scheduling of the upload queue, and there is no point spawning so many Tokio tasks.
pub const DEFAULT_REMOTE_STORAGE_LOCALFS_CONCURRENCY_LIMIT: usize = 100;
/// No limits on the client side, which currenltly means 1000 for AWS S3.
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

View File

@@ -38,12 +38,14 @@ impl Display for SafekeeperId {
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
pub m: Vec<SafekeeperId>,
pub members: Vec<SafekeeperId>,
}
impl MemberSet {
pub fn empty() -> Self {
MemberSet { m: Vec::new() }
MemberSet {
members: Vec::new(),
}
}
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
@@ -51,11 +53,11 @@ impl MemberSet {
if hs.len() != members.len() {
bail!("duplicate safekeeper id in the set {:?}", members);
}
Ok(MemberSet { m: members })
Ok(MemberSet { members })
}
pub fn contains(&self, sk: &SafekeeperId) -> bool {
self.m.iter().any(|m| m.id == sk.id)
self.members.iter().any(|m| m.id == sk.id)
}
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -65,7 +67,7 @@ impl MemberSet {
sk.id, self
));
}
self.m.push(sk);
self.members.push(sk);
Ok(())
}
}
@@ -73,7 +75,11 @@ impl MemberSet {
impl Display for MemberSet {
/// Display as a comma separated list of members.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
let sks_str = self
.members
.iter()
.map(|m| m.to_string())
.collect::<Vec<_>>();
write!(f, "({})", sks_str.join(", "))
}
}

View File

@@ -215,7 +215,6 @@ impl Wrapper {
syncSafekeepers: config.sync_safekeepers,
systemId: 0,
pgTimeline: 1,
proto_version: 3,
callback_data,
};
let c_config = Box::into_raw(Box::new(c_config));
@@ -277,7 +276,6 @@ mod tests {
use core::panic;
use std::{
cell::Cell,
ffi::CString,
sync::{atomic::AtomicUsize, mpsc::sync_channel},
};
@@ -498,64 +496,57 @@ mod tests {
// Messages definitions are at walproposer.h
// xxx: it would be better to extract them from safekeeper crate and
// use serialization/deserialization here.
let greeting_tag = (b'g').to_be_bytes();
let tenant_id = CString::new(ttid.tenant_id.to_string())
.unwrap()
.into_bytes_with_nul();
let timeline_id = CString::new(ttid.timeline_id.to_string())
.unwrap()
.into_bytes_with_nul();
let mconf_gen = 0_u32.to_be_bytes();
let mconf_members_len = 0_u32.to_be_bytes();
let mconf_members_new_len = 0_u32.to_be_bytes();
let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes();
let system_id = 0_u64.to_be_bytes();
let wal_seg_size = 16777216_u32.to_be_bytes();
let greeting_tag = (b'g' as u64).to_ne_bytes();
let proto_version = 2_u32.to_ne_bytes();
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
let proposer_id = [0; 16];
let system_id = 0_u64.to_ne_bytes();
let tenant_id = ttid.tenant_id.as_arr();
let timeline_id = ttid.timeline_id.as_arr();
let pg_tli = 1_u32.to_ne_bytes();
let wal_seg_size = 16777216_u32.to_ne_bytes();
let proposer_greeting = [
greeting_tag.as_slice(),
proto_version.as_slice(),
pg_version.as_slice(),
proposer_id.as_slice(),
system_id.as_slice(),
tenant_id.as_slice(),
timeline_id.as_slice(),
mconf_gen.as_slice(),
mconf_members_len.as_slice(),
mconf_members_new_len.as_slice(),
pg_version.as_slice(),
system_id.as_slice(),
pg_tli.as_slice(),
wal_seg_size.as_slice(),
]
.concat();
let voting_tag = (b'v').to_be_bytes();
let vote_request_term = 3_u64.to_be_bytes();
let voting_tag = (b'v' as u64).to_ne_bytes();
let vote_request_term = 3_u64.to_ne_bytes();
let proposer_id = [0; 16];
let vote_request = [
voting_tag.as_slice(),
mconf_gen.as_slice(),
vote_request_term.as_slice(),
proposer_id.as_slice(),
]
.concat();
let acceptor_greeting_term = 2_u64.to_be_bytes();
let acceptor_greeting_node_id = 1_u64.to_be_bytes();
let acceptor_greeting_term = 2_u64.to_ne_bytes();
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
let acceptor_greeting = [
greeting_tag.as_slice(),
acceptor_greeting_node_id.as_slice(),
mconf_gen.as_slice(),
mconf_members_len.as_slice(),
mconf_members_new_len.as_slice(),
acceptor_greeting_term.as_slice(),
acceptor_greeting_node_id.as_slice(),
]
.concat();
let vote_response_term = 3_u64.to_be_bytes();
let vote_given = 1_u8.to_be_bytes();
let flush_lsn = 0x539_u64.to_be_bytes();
let truncate_lsn = 0x539_u64.to_be_bytes();
let th_len = 1_u32.to_be_bytes();
let th_term = 2_u64.to_be_bytes();
let th_lsn = 0x539_u64.to_be_bytes();
let vote_response_term = 3_u64.to_ne_bytes();
let vote_given = 1_u64.to_ne_bytes();
let flush_lsn = 0x539_u64.to_ne_bytes();
let truncate_lsn = 0x539_u64.to_ne_bytes();
let th_len = 1_u32.to_ne_bytes();
let th_term = 2_u64.to_ne_bytes();
let th_lsn = 0x539_u64.to_ne_bytes();
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
let vote_response = [
voting_tag.as_slice(),
mconf_gen.as_slice(),
vote_response_term.as_slice(),
vote_given.as_slice(),
flush_lsn.as_slice(),
@@ -563,6 +554,7 @@ mod tests {
th_len.as_slice(),
th_term.as_slice(),
th_lsn.as_slice(),
timeline_start_lsn.as_slice(),
]
.concat();

View File

@@ -32,6 +32,7 @@ use utils::id::TimelineId;
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext};
use crate::pgdatadir_mapping::DatadirModificationStats;
use crate::task_mgr::TaskKind;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::mgr::TenantSlot;
@@ -2378,11 +2379,40 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) values_committed_metadata_images: IntCounter,
pub(crate) values_committed_metadata_deltas: IntCounter,
pub(crate) values_committed_data_images: IntCounter,
pub(crate) values_committed_data_deltas: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}
impl WalIngestMetrics {
pub(crate) fn inc_values_committed(&self, stats: &DatadirModificationStats) {
if stats.metadata_images > 0 {
self.values_committed_metadata_images
.inc_by(stats.metadata_images);
}
if stats.metadata_deltas > 0 {
self.values_committed_metadata_deltas
.inc_by(stats.metadata_deltas);
}
if stats.data_images > 0 {
self.values_committed_data_images.inc_by(stats.data_images);
}
if stats.data_deltas > 0 {
self.values_committed_data_deltas.inc_by(stats.data_deltas);
}
}
}
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
let values_committed = register_int_counter_vec!(
"pageserver_wal_ingest_values_committed",
"Number of values committed to pageserver storage from WAL records",
&["class", "kind"],
)
.expect("failed to define a metric");
WalIngestMetrics {
bytes_received: register_int_counter!(
"pageserver_wal_ingest_bytes_received",
@@ -2409,17 +2439,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
values_committed_data_deltas: values_committed.with_label_values(&["data", "delta"]),
gap_blocks_zeroed_on_rel_extend: register_int_counter!(
"pageserver_gap_blocks_zeroed_on_rel_extend",
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
clear_vm_bits_unknown: register_int_counter_vec!(
"pageserver_wal_ingest_clear_vm_bits_unknown",
"Number of ignored ClearVmBits operations due to unknown pages/relations",
&["entity"],
)
.expect("failed to define a metric"),
}
});

View File

@@ -1280,8 +1280,6 @@ impl PageServerHandler {
}
Ok(())
}
// and log the info! line inside the request span
.instrument(span.clone())
.await?;
}
Ok(())
@@ -2037,6 +2035,12 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
if timeline.is_archived() == Some(true) {
// TODO after a grace period, turn this log line into a hard error
tracing::warn!("timeline {tenant_id}/{timeline_id} is archived, but got basebackup request for it.");
//return Err(QueryError::NotFound("timeline is archived".into()))
}
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.

View File

@@ -48,7 +48,7 @@ use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::pausable_failpoint;
use utils::{bin_ser::BeSer, lsn::Lsn};
use wal_decoder::serialized_batch::SerializedValueBatch;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -1297,6 +1297,26 @@ impl DatadirModification<'_> {
.is_some_and(|b| b.has_data())
}
/// Returns statistics about the currently pending modifications.
pub(crate) fn stats(&self) -> DatadirModificationStats {
let mut stats = DatadirModificationStats::default();
for (_, _, value) in self.pending_metadata_pages.values().flatten() {
match value {
Value::Image(_) => stats.metadata_images += 1,
Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
Value::WalRecord(_) => stats.metadata_deltas += 1,
}
}
for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
match valuemeta {
ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
ValueMeta::Serialized(_) => stats.data_deltas += 1,
ValueMeta::Observed(_) => {}
}
}
stats
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
@@ -2317,6 +2337,15 @@ impl DatadirModification<'_> {
}
}
/// Statistics for a DatadirModification.
#[derive(Default)]
pub struct DatadirModificationStats {
pub metadata_images: u64,
pub metadata_deltas: u64,
pub data_images: u64,
pub data_deltas: u64,
}
/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
///

View File

@@ -437,8 +437,7 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
self.update_remote_physical_size_gauge(Some(index_part));
@@ -461,8 +460,7 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
self.update_remote_physical_size_gauge(None);
@@ -484,8 +482,7 @@ impl RemoteTimelineClient {
.conf
.remote_storage_config
.as_ref()
.and_then(|r| r.concurrency_limit())
.unwrap_or(0);
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;

View File

@@ -9,13 +9,14 @@ use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode,
mgr::GetTenantError,
mgr::TenantManager,
mgr::{GetTenantError, TenantManager},
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
Tenant,
},
virtual_file::VirtualFile,
TEMP_FILE_SUFFIX,
};
use futures::Future;
@@ -32,7 +33,10 @@ use super::{
};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension,
yielding_loop::yielding_loop,
};
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
@@ -461,6 +465,18 @@ async fn upload_tenant_heatmap(
}
}
// After a successful upload persist the fresh heatmap to disk.
// When restarting, the tenant will read the heatmap from disk
// and additively generate a new heatmap (see [`Timeline::generate_heatmap`]).
// If the heatmap is stale, the additive generation can lead to keeping previously
// evicted timelines on the secondarie's disk.
let tenant_shard_id = tenant.get_tenant_shard_id();
let heatmap_path = tenant.conf.tenant_heatmap_path(tenant_shard_id);
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
if let Err(err) = VirtualFile::crashsafe_overwrite(heatmap_path, temp_path, bytes).await {
tracing::warn!("Non fatal IO error writing to disk after heatmap upload: {err}");
}
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {

View File

@@ -211,7 +211,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run_count = 0;
// schedule the next compaction immediately in case there is a pending compaction task
sleep_duration = if let CompactionOutcome::Pending = outcome {
Duration::ZERO
Duration::from_secs(1)
} else {
period
};

View File

@@ -192,7 +192,12 @@ pub enum ImageLayerCreationMode {
#[derive(Clone, Debug, Default)]
pub enum LastImageLayerCreationStatus {
Incomplete, // TODO: record the last key being processed
Incomplete {
/// The last key of the partition (exclusive) that was processed in the last
/// image layer creation attempt. We will continue from this key in the next
/// attempt.
last_key: Key,
},
Complete,
#[default]
Initial,
@@ -4346,7 +4351,7 @@ impl Timeline {
Ok(result)
}
// Is it time to create a new image layer for the given partition?
// Is it time to create a new image layer for the given partition? True if we want to generate.
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let threshold = self.get_image_creation_threshold();
@@ -4658,6 +4663,11 @@ impl Timeline {
) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
let timer = self.metrics.create_images_time_histo.start_timer();
if partitioning.parts.is_empty() {
warn!("no partitions to create image layers for");
return Ok((vec![], LastImageLayerCreationStatus::Complete));
}
// We need to avoid holes between generated image layers.
// Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one
// image layer with hole between them. In this case such layer can not be utilized by GC.
@@ -4669,28 +4679,65 @@ impl Timeline {
// image layers <100000000..100000099> and <200000000..200000199> are not completely covering it.
let mut start = Key::MIN;
let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status {
info!(
"resuming image layer creation: last_status={:?}",
last_status
);
true
} else {
self.should_check_if_image_layers_required(lsn)
};
let check_for_image_layers =
if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
info!(
"resuming image layer creation: last_status=incomplete, continue from {}",
last_key
);
true
} else {
self.should_check_if_image_layers_required(lsn)
};
let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
let mut all_generated = true;
let mut partition_processed = 0;
let total_partitions = partitioning.parts.len();
let mut total_partitions = partitioning.parts.len();
let mut last_partition_processed = None;
let mut partition_parts = partitioning.parts.clone();
for partition in partitioning.parts.iter() {
if let LastImageLayerCreationStatus::Incomplete { last_key } = last_status {
// We need to skip the partitions that have already been processed.
let mut found = false;
for (i, partition) in partition_parts.iter().enumerate() {
if last_key <= partition.end().unwrap() {
// ```plain
// |------|--------|----------|------|
// ^last_key
// ^start from this partition
// ```
// Why `i+1` instead of `i`?
// It is possible that the user did some writes after the previous image layer creation attempt so that
// a relation grows in size, and the last_key is now in the middle of the partition. In this case, we
// still want to skip this partition, so that we can make progress and avoid generating image layers over
// the same partition. Doing a mod to ensure we don't end up with an empty vec.
if i + 1 >= total_partitions {
// In general, this case should not happen -- if last_key is on the last partition, the previous
// iteration of image layer creation should return a complete status.
break; // with found=false
}
partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
total_partitions = partition_parts.len();
// Update the start key to the partition start.
start = partition_parts[0].start().unwrap();
found = true;
break;
}
}
if !found {
// Last key is within the last partition, or larger than all partitions.
return Ok((vec![], LastImageLayerCreationStatus::Complete));
}
}
for partition in partition_parts.iter() {
if self.cancel.is_cancelled() {
return Err(CreateImageLayersError::Cancelled);
}
partition_processed += 1;
let img_range = start..partition.ranges.last().unwrap().end;
let compact_metadata = partition.overlaps(&Key::metadata_key_range());
if compact_metadata {
@@ -4725,6 +4772,8 @@ impl Timeline {
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
is_delta: false,
}) {
// TODO: this can be processed with the BatchLayerWriter::finish_with_discard
// in the future.
tracing::info!(
"Skipping image layer at {lsn} {}..{}, already exists",
img_range.start,
@@ -4805,8 +4854,6 @@ impl Timeline {
}
}
partition_processed += 1;
if let ImageLayerCreationMode::Try = mode {
// We have at least made some progress
if batch_image_writer.pending_layer_num() >= 1 {
@@ -4822,8 +4869,10 @@ impl Timeline {
* self.get_compaction_threshold();
if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
tracing::info!(
"preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}",
"preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
);
last_partition_processed = Some(partition.clone());
all_generated = false;
break;
}
@@ -4868,7 +4917,14 @@ impl Timeline {
if all_generated {
LastImageLayerCreationStatus::Complete
} else {
LastImageLayerCreationStatus::Incomplete
LastImageLayerCreationStatus::Incomplete {
last_key: if let Some(last_partition_processed) = last_partition_processed {
last_partition_processed.end().unwrap_or(Key::MIN)
} else {
// This branch should be unreachable, but in case it happens, we can just return the start key.
Key::MIN
},
}
},
))
}

View File

@@ -33,6 +33,7 @@ use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
use crate::tenant::layer_map::LayerMap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -438,6 +439,11 @@ impl KeyHistoryRetention {
if dry_run {
return true;
}
if LayerMap::is_l0(&key.key_range, key.is_delta) {
// gc-compaction should not produce L0 deltas, otherwise it will break the layer order.
// We should ignore such layers.
return true;
}
let layer_generation;
{
let guard = tline.layers.read().await;
@@ -748,7 +754,7 @@ impl Timeline {
.store(Arc::new(outcome.clone()));
self.upload_new_image_layers(image_layers)?;
if let LastImageLayerCreationStatus::Incomplete = outcome {
if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
// Yield and do not do any other kind of compaction.
info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
return Ok(CompactionOutcome::Pending);

View File

@@ -355,6 +355,19 @@ pub(super) async fn handle_walreceiver_connection(
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
async fn commit(
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
uncommitted: &mut u64,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST.records_committed.inc_by(*uncommitted);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
Ok(())
}
if !records.is_empty() {
timeline
.metrics
@@ -366,8 +379,7 @@ pub(super) async fn handle_walreceiver_connection(
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
modification.commit(&ctx).await?;
uncommitted_records = 0;
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
}
let local_next_record_lsn = interpreted.next_record_lsn;
@@ -396,8 +408,7 @@ pub(super) async fn handle_walreceiver_connection(
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
modification.commit(&ctx).await?;
uncommitted_records = 0;
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
}
}
@@ -415,7 +426,7 @@ pub(super) async fn handle_walreceiver_connection(
if uncommitted_records > 0 || needs_last_record_lsn_advance {
// Commit any uncommitted records
modification.commit(&ctx).await?;
commit(&mut modification, &ctx, &mut uncommitted_records).await?;
}
if !caught_up && streaming_lsn >= end_of_wal {
@@ -442,10 +453,12 @@ pub(super) async fn handle_walreceiver_connection(
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
modification.commit(ctx).await?;
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
*filtered = 0;
Ok(())

View File

@@ -28,17 +28,9 @@ use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::walrecord::*;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use wal_decoder::models::*;
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use utils::failpoint_support;
use utils::rate_limit::RateLimit;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
@@ -50,11 +42,18 @@ use crate::ZERO_PAGE;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::fsm_logical_to_physical;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::walrecord::*;
use postgres_ffi::TransactionId;
use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz};
use utils::bin_ser::SerializeError;
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical, failpoint_support};
use wal_decoder::models::*;
enum_pgversion! {CheckPoint, pgv::CheckPoint}
@@ -327,93 +326,75 @@ impl WalIngest {
let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
// Sometimes, Postgres seems to create heap WAL records with the
// ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
// not set. In fact, it's possible that the VM page does not exist at all.
// In that case, we don't want to store a record to clear the VM bit;
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
// VM bits can only be cleared on the shard(s) owning the VM relation, and must be within
// its view of the VM relation size. Out of caution, log a critical error instead of failing WAL
// ingestion, as there have historically been cases where PostgreSQL has cleared spurious VM pages. See:
// https://github.com/neondatabase/neon/pull/10634.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
critical!("clear_vm_bits for unknown VM relation {vm_rel}");
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
critical!("new_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
critical!("old_vm_blk {blknum} not in {vm_rel} of size {vm_size}");
old_vm_blk = None;
}
}
if new_vm_blk.is_some() || old_vm_blk.is_some() {
if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both old and the
// new page, both of which reside on the same VM page.
if new_vm_blk.is_none() && old_vm_blk.is_none() {
return Ok(());
} else if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both old and the new page, both of
// which reside on the same VM page.
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk.unwrap(),
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk.unwrap(),
new_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
},
ctx,
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
old_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on
// different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
new_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
},
ctx,
)
.await?;
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
modification,
vm_rel,
old_vm_blk,
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
},
ctx,
)
.await?;
}
}
}
Ok(())
}
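For orientation, a minimal standalone sketch (not Neon's code) of the decision the hunk above makes: the two heap block numbers from an UPDATE are mapped to visibility-map pages, and a single combined ClearVisibilityMapFlags record suffices only when both land on the same VM page. HEAPBLOCKS_PER_VM_PAGE is a placeholder value; the real mapping comes from pg_constants::HEAPBLK_TO_MAPBLOCK.

/*
 * Illustrative only: how many VM WAL records an UPDATE needs,
 * given which heap blocks have bits to clear.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HEAPBLOCKS_PER_VM_PAGE 32672   /* assumed value for illustration */
#define HEAPBLK_TO_MAPBLOCK(blk) ((blk) / HEAPBLOCKS_PER_VM_PAGE)

/* Returns how many ClearVisibilityMapFlags records would be emitted: 0, 1 or 2. */
static int
vm_records_needed(bool has_new, uint32_t new_heap_blkno,
                  bool has_old, uint32_t old_heap_blkno)
{
    if (!has_new && !has_old)
        return 0;                      /* nothing to clear */
    if (has_new && has_old &&
        HEAPBLK_TO_MAPBLOCK(new_heap_blkno) == HEAPBLK_TO_MAPBLOCK(old_heap_blkno))
        return 1;                      /* both bits live on the same VM page */
    return (has_new ? 1 : 0) + (has_old ? 1 : 0);
}

int
main(void)
{
    printf("%d\n", vm_records_needed(true, 10, true, 20));        /* prints 1 */
    printf("%d\n", vm_records_needed(true, 10, true, 100000));    /* prints 2 */
    return 0;
}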

View File

@@ -509,47 +509,44 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.blockNum = (blkno + i) & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
hash = get_hash_value(lfc_hash, &tag);
chunk_offs = (blkno + i) & (BLOCKS_PER_CHUNK - 1);
chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
LWLockAcquire(lfc_lock, LW_SHARED);
if (!LFC_ENABLED())
{
LWLockRelease(lfc_lock);
return 0;
}
while (true)
{
int this_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
if (LFC_ENABLED())
{
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
if (entry != NULL)
if (entry != NULL)
{
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
{
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
if ((entry->bitmap[chunk_offs >> 5] &
((uint32)1 << (chunk_offs & 31))) != 0)
{
if ((entry->bitmap[chunk_offs >> 5] &
((uint32)1 << (chunk_offs & 31))) != 0)
{
BITMAP_SET(bitmap, i);
found++;
}
BITMAP_SET(bitmap, i);
found++;
}
}
else
{
i += this_chunk;
}
}
else
{
LWLockRelease(lfc_lock);
return found;
i += this_chunk;
}
/*
* Break out of the iteration before doing expensive stuff for
* the next iteration
*/
if (i + 1 >= nblocks)
if (i >= nblocks)
break;
/*
@@ -563,8 +560,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
#if USE_ASSERT_CHECKING
do {
#ifdef USE_ASSERT_CHECKING
{
int count = 0;
for (int j = 0; j < nblocks; j++)
@@ -574,7 +571,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
Assert(count == found);
} while (false);
}
#endif
return found;
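A small self-contained sketch of the chunk arithmetic used by lfc_cache_containsv above, assuming BLOCKS_PER_CHUNK is a power of two (128 below is only an illustrative value): the block number splits into a chunk-aligned key and an offset, and presence is tracked as one bit per block in the chunk's bitmap.

/* Illustrative only: block -> (chunk key, bitmap bit) mapping. */
#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CHUNK 128           /* assumed value for illustration */

int
main(void)
{
    uint32_t blkno = 1000;
    uint32_t bitmap[BLOCKS_PER_CHUNK / 32] = {0};

    uint32_t chunk_first_block = blkno & ~(BLOCKS_PER_CHUNK - 1); /* hash key component */
    uint32_t chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);         /* bit index in chunk */

    /* mark the block as cached ... */
    bitmap[chunk_offs >> 5] |= (uint32_t) 1 << (chunk_offs & 31);

    /* ... and test it the same way the scan above does */
    int present = (bitmap[chunk_offs >> 5] & ((uint32_t) 1 << (chunk_offs & 31))) != 0;
    printf("chunk=%u offs=%u present=%d\n", chunk_first_block, chunk_offs, present);
    return 0;
}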

View File

@@ -36,6 +36,11 @@
#include "pagestore_client.h"
#include "walproposer.h"
#ifdef __linux__
#include <sys/ioctl.h>
#include <linux/sockios.h>
#endif
#define PageStoreTrace DEBUG5
#define MIN_RECONNECT_INTERVAL_USEC 1000
@@ -728,11 +733,36 @@ retry:
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
if (INSTR_TIME_GET_MILLISEC(since_last_log) >= LOG_INTERVAL_MS)
{
int sndbuf = -1;
int recvbuf = -1;
#ifdef __linux__
int socketfd;
#endif
since_start = now;
INSTR_TIME_SUBTRACT(since_start, start_ts);
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
#ifdef __linux__
/*
* get kernel's send and recv queue size via ioctl
* https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27
*/
socketfd = PQsocket(pageserver_conn);
if (socketfd != -1) {
int ioctl_err;
ioctl_err = ioctl(socketfd, SIOCOUTQ, &sndbuf);
if (ioctl_err != 0) {
sndbuf = -errno;
}
ioctl_err = ioctl(socketfd, FIONREAD, &recvbuf);
if (ioctl_err != 0) {
recvbuf = -errno;
}
}
#endif
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)",
INSTR_TIME_GET_DOUBLE(since_start),
shard->nrequests_sent, shard->nresponses_received);
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf);
last_log_ts = now;
logged = true;
}
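The new log line above samples the kernel's socket queues. Below is a standalone sketch of the same probe, using a throwaway TCP socket instead of PQsocket(pageserver_conn): on Linux, SIOCOUTQ reports bytes not yet sent and FIONREAD reports bytes not yet read, with failures encoded as negative errno values just as in the patch.

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>
#ifdef __linux__
#include <linux/sockios.h>
#endif

int
main(void)
{
    int sndbuf = -1;
    int recvbuf = -1;
    int fd = socket(AF_INET, SOCK_STREAM, 0);   /* stand-in for the pageserver connection fd */

    if (fd == -1)
        return 1;
#ifdef __linux__
    if (ioctl(fd, SIOCOUTQ, &sndbuf) != 0)      /* unsent bytes in the send queue */
        sndbuf = -errno;
    if (ioctl(fd, FIONREAD, &recvbuf) != 0)     /* unread bytes in the receive queue */
        recvbuf = -errno;
#endif
    printf("sndbuf=%d recvbuf=%d\n", sndbuf, recvbuf);
    close(fd);
    return 0;
}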

View File

@@ -51,26 +51,6 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
return true;
}
/* --------------------------------
* pq_getmsgint16 - get a binary 2-byte int from a message buffer
* --------------------------------
*/
uint16
pq_getmsgint16(StringInfo msg)
{
return pq_getmsgint(msg, 2);
}
/* --------------------------------
* pq_getmsgint32 - get a binary 4-byte int from a message buffer
* --------------------------------
*/
uint32
pq_getmsgint32(StringInfo msg)
{
return pq_getmsgint(msg, 4);
}
/* --------------------------------
* pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order
* --------------------------------

View File

@@ -8,8 +8,6 @@
#endif
bool HexDecodeString(uint8 *result, char *input, int nbytes);
uint16 pq_getmsgint16(StringInfo msg);
uint32 pq_getmsgint32(StringInfo msg);
uint32 pq_getmsgint32_le(StringInfo msg);
uint64 pq_getmsgint64_le(StringInfo msg);
void pq_sendint32_le(StringInfo buf, uint32 i);

View File

@@ -916,7 +916,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
{
uint64 min_ring_index;
PrefetchRequest hashkey;
#if USE_ASSERT_CHECKING
#ifdef USE_ASSERT_CHECKING
bool any_hits = false;
#endif
/* We will never read further ahead than our buffer can store. */
@@ -955,7 +955,7 @@ Retry:
else
lsns = NULL;
#if USE_ASSERT_CHECKING
#ifdef USE_ASSERT_CHECKING
any_hits = true;
#endif

View File

@@ -70,7 +70,6 @@ static bool SendAppendRequests(Safekeeper *sk);
static bool RecvAppendResponses(Safekeeper *sk);
static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
@@ -82,8 +81,6 @@ static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
static void MembershipConfigurationFree(MembershipConfiguration *mconf);
WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -140,21 +137,25 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
}
wp->quorum = wp->n_safekeepers / 2 + 1;
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
/* Fill the greeting package */
wp->greetRequest.pam.tag = 'g';
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
wp->greetRequest.tenant_id = wp->config->neon_tenant;
wp->greetRequest.tag = 'g';
wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
wp->greetRequest.pgVersion = PG_VERSION_NUM;
wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
wp->greetRequest.systemId = wp->config->systemId;
if (!wp->config->neon_timeline)
wp_log(FATAL, "neon.timeline_id is not provided");
wp->greetRequest.timeline_id = wp->config->neon_timeline;
wp->greetRequest.pg_version = PG_VERSION_NUM;
wp->greetRequest.system_id = wp->config->systemId;
wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;
if (*wp->config->neon_timeline != '\0' &&
!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
if (!wp->config->neon_tenant)
wp_log(FATAL, "neon.tenant_id is not provided");
if (*wp->config->neon_tenant != '\0' &&
!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
wp->greetRequest.timeline = wp->config->pgTimeline;
wp->greetRequest.walSegSize = wp->config->wal_segment_size;
wp->api.init_event_set(wp);
@@ -164,14 +165,12 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
void
WalProposerFree(WalProposer *wp)
{
MembershipConfigurationFree(&wp->mconf);
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];
Assert(sk->outbuf.data != NULL);
pfree(sk->outbuf.data);
MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -309,7 +308,6 @@ ShutdownConnection(Safekeeper *sk)
sk->state = SS_OFFLINE;
sk->streamingAt = InvalidXLogRecPtr;
MembershipConfigurationFree(&sk->greetResponse.mconf);
if (sk->voteResponse.termHistory.entries)
pfree(sk->voteResponse.termHistory.entries);
sk->voteResponse.termHistory.entries = NULL;
@@ -600,14 +598,11 @@ static void
SendStartWALPush(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
#define CMD_LEN 512
char cmd[CMD_LEN];
snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
if (!wp->api.conn_send_query(sk, cmd))
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
{
wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
}
@@ -663,33 +658,23 @@ RecvStartWALPushResult(Safekeeper *sk)
/*
* Start handshake: first of all send information about the
* walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
* safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
* a response to finish the handshake.
*/
static void
SendProposerGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);
wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
pfree(mconf_toml);
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
&sk->outbuf, wp->config->proto_version);
/*
* On failure, logging & resetting the connection is handled. We just need
* to handle the control flow.
*/
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
}
static void
RecvAcceptorGreeting(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
char *mconf_toml;
/*
* If our reading doesn't immediately succeed, any necessary error
@@ -700,10 +685,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
return;
mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = %lu, mconf = %s, term=" UINT64_FORMAT,
sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
pfree(mconf_toml);
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
@@ -725,9 +707,12 @@ RecvAcceptorGreeting(Safekeeper *sk)
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
wp->voteRequest.pam.tag = 'v';
wp->voteRequest.generation = wp->mconf.generation;
wp->voteRequest.term = wp->propTerm;
wp->voteRequest = (VoteRequest)
{
.tag = 'v',
.term = wp->propTerm
};
memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
}
}
else if (sk->greetResponse.term > wp->propTerm)
@@ -774,14 +759,12 @@ SendVoteRequest(Safekeeper *sk)
{
WalProposer *wp = sk->wp;
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
&sk->outbuf, wp->config->proto_version);
/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
wp->voteRequest.generation, wp->voteRequest.term);
wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
return;
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
}
@@ -795,12 +778,11 @@ RecvVoteResponse(Safekeeper *sk)
return;
wp_log(LOG,
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
sk->voteResponse.voteGiven,
GetHighestTerm(&sk->voteResponse.termHistory),
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
/*
* In case of acceptor rejecting our vote, bail out, but only if either it
@@ -865,9 +847,9 @@ HandleElectedProposer(WalProposer *wp)
* otherwise we must be sync-safekeepers and we have nothing to do then.
*
* Proceeding is not only pointless but harmful, because we'd give
* safekeepers term history starting with 0/0. These hacks will go away
* once we disable implicit timeline creation on safekeepers and create it
* with non zero LSN from the start.
* safekeepers term history starting with 0/0. These hacks will go away once
* we disable implicit timeline creation on safekeepers and create it with
* non zero LSN from the start.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr)
{
@@ -960,6 +942,7 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
wp->truncateLsn = InvalidXLogRecPtr;
wp->timelineStartLsn = InvalidXLogRecPtr;
for (int i = 0; i < wp->n_safekeepers; i++)
{
@@ -976,6 +959,20 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->donor = i;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
{
/* timelineStartLsn should be the same everywhere or unknown */
if (wp->timelineStartLsn != InvalidXLogRecPtr &&
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
{
wp_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
}
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
}
}
}
@@ -998,11 +995,22 @@ DetermineEpochStartLsn(WalProposer *wp)
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
if (wp->timelineStartLsn == InvalidXLogRecPtr)
{
wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);
/*
* Safekeepers set truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
(wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
/*
* We will be generating WAL since propEpochStartLsn, so we should set
@@ -1045,11 +1053,10 @@ DetermineEpochStartLsn(WalProposer *wp)
if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
{
/*
* However, allow to proceed if last_log_term on the node which
* gave the highest vote (i.e. point where we are going to start
* writing) actually had been won by me; plain restart of
* walproposer not intervened by concurrent compute which wrote
* WAL is ok.
* However, allow to proceed if last_log_term on the node which gave
* the highest vote (i.e. point where we are going to start writing)
* actually had been won by me; plain restart of walproposer not
* intervened by concurrent compute which wrote WAL is ok.
*
* This avoids compute crash after manual term_bump.
*/
@@ -1119,8 +1126,14 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/*
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
* is created manually (test_s3_wal_replay)
*/
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
}
else
{
@@ -1145,19 +1158,29 @@ SendProposerElected(Safekeeper *sk)
Assert(sk->startStreamingAt <= wp->availableLsn);
msg.apm.tag = 'e';
msg.generation = wp->mconf.generation;
msg.tag = 'e';
msg.term = wp->propTerm;
msg.startStreamingAt = sk->startStreamingAt;
msg.termHistory = &wp->propTermHistory;
msg.timelineStartLsn = wp->timelineStartLsn;
lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
wp_log(LOG,
"sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
resetStringInfo(&sk->outbuf);
pq_sendint64_le(&sk->outbuf, msg.tag);
pq_sendint64_le(&sk->outbuf, msg.term);
pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
for (int i = 0; i < msg.termHistory->n_entries; i++)
{
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
}
pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);
PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
return;
@@ -1223,13 +1246,14 @@ static void
PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
{
Assert(endLsn >= beginLsn);
req->apm.tag = 'a';
req->generation = wp->mconf.generation;
req->tag = 'a';
req->term = wp->propTerm;
req->epochStartLsn = wp->propEpochStartLsn;
req->beginLsn = beginLsn;
req->endLsn = endLsn;
req->commitLsn = wp->commitLsn;
req->truncateLsn = wp->truncateLsn;
req->proposerId = wp->greetRequest.proposerId;
}
/*
@@ -1330,8 +1354,7 @@ SendAppendRequests(Safekeeper *sk)
resetStringInfo(&sk->outbuf);
/* write AppendRequest header */
PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
/* prepare for reading WAL into the outbuf */
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
sk->active_state = SS_ACTIVE_READ_WAL;
}
@@ -1344,17 +1367,14 @@ SendAppendRequests(Safekeeper *sk)
req = &sk->appendRequest;
req_len = req->endLsn - req->beginLsn;
/*
* We send zero-sized AppendRequests as heartbeats; don't wal_read
* for these.
*/
/* We send zero-sized AppendRequests as heartbeats; don't wal_read for these. */
if (req_len > 0)
{
switch (wp->api.wal_read(sk,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req_len,
&errmsg))
{
case NEON_WALREAD_SUCCESS:
break;
@@ -1362,7 +1382,7 @@ SendAppendRequests(Safekeeper *sk)
return true;
case NEON_WALREAD_ERROR:
wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port, errmsg);
sk->host, sk->port, errmsg);
ShutdownConnection(sk);
return false;
default:
@@ -1450,11 +1470,11 @@ RecvAppendResponses(Safekeeper *sk)
* Term has changed to higher one, probably another compute is
* running. If this is the case we could PANIC as well because
* likely it inserted some data and our basebackup is unsuitable
* anymore. However, we also bump term manually (term_bump
* endpoint) on safekeepers for migration purposes, in this case
* we do want compute to stay alive. So restart walproposer with
* FATAL instead of panicking; if basebackup is spoiled next
* election will notice this.
* anymore. However, we also bump term manually (term_bump endpoint)
* on safekeepers for migration purposes, in this case we do want
* compute to stay alive. So restart walproposer with FATAL instead
* of panicking; if basebackup is spoiled next election will notice
* this.
*/
wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
sk->host, sk->port,
@@ -1489,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
for (i = 0; i < nkeys; i++)
{
const char *key = pq_getmsgrawstring(reply_message);
const char *key = pq_getmsgstring(reply_message);
unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));
if (strcmp(key, "current_timeline_size") == 0)
@@ -1730,208 +1750,6 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
}
}
/* Serialize MembershipConfiguration into buf. */
static void
MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;
pq_sendint32(buf, mconf->generation);
pq_sendint32(buf, mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
pq_sendint64(buf, mconf->members.m[i].node_id);
pq_send_ascii_string(buf, mconf->members.m[i].host);
pq_sendint16(buf, mconf->members.m[i].port);
}
/*
* There is no special mark for absent new_members; zero members is
* invalid, so zero len means absent.
*/
pq_sendint32(buf, mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
pq_sendint64(buf, mconf->new_members.m[i].node_id);
pq_send_ascii_string(buf, mconf->new_members.m[i].host);
pq_sendint16(buf, mconf->new_members.m[i].port);
}
}
/* Serialize proposer -> acceptor message into buf using specified version */
static void
PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
{
/* both versions are currently supported until we fully migrate to 3 */
Assert(proto_version == 3 || proto_version == 2);
resetStringInfo(buf);
if (proto_version == 3)
{
/*
* v2 sends structs for some messages as-is, so the common tag is
* written here only for v3
*/
pq_sendint8(buf, msg->tag);
switch (msg->tag)
{
case 'g':
{
ProposerGreeting *m = (ProposerGreeting *) msg;
pq_send_ascii_string(buf, m->tenant_id);
pq_send_ascii_string(buf, m->timeline_id);
MembershipConfigurationSerialize(&m->mconf, buf);
pq_sendint32(buf, m->pg_version);
pq_sendint64(buf, m->system_id);
pq_sendint32(buf, m->wal_seg_size);
break;
}
case 'v':
{
VoteRequest *m = (VoteRequest *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
break;
}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->startStreamingAt);
pq_sendint32(buf, m->termHistory->n_entries);
for (uint32 i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64(buf, m->termHistory->entries[i].term);
pq_sendint64(buf, m->termHistory->entries[i].lsn);
}
break;
}
case 'a':
{
/*
* Note: this serializes only AppendRequestHeader, caller
* is expected to append WAL data later.
*/
AppendRequestHeader *m = (AppendRequestHeader *) msg;
pq_sendint32(buf, m->generation);
pq_sendint64(buf, m->term);
pq_sendint64(buf, m->beginLsn);
pq_sendint64(buf, m->endLsn);
pq_sendint64(buf, m->commitLsn);
pq_sendint64(buf, m->truncateLsn);
break;
}
default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}
if (proto_version == 2)
{
switch (msg->tag)
{
case 'g':
{
/* v2 sent struct as is */
ProposerGreeting *m = (ProposerGreeting *) msg;
ProposerGreetingV2 greetRequestV2;
/* Fill also v2 struct. */
greetRequestV2.tag = 'g';
greetRequestV2.protocolVersion = proto_version;
greetRequestV2.pgVersion = m->pg_version;
/*
* v3 removed this field because it's easier to pass as
* libpq or START_WAL_PUSH options
*/
memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
greetRequestV2.systemId = wp->config->systemId;
if (*m->timeline_id != '\0' &&
!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
if (*m->tenant_id != '\0' &&
!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);
greetRequestV2.timeline = wp->config->pgTimeline;
greetRequestV2.walSegSize = wp->config->wal_segment_size;
pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
break;
}
case 'v':
{
/* v2 sent struct as is */
VoteRequest *m = (VoteRequest *) msg;
VoteRequestV2 voteRequestV2;
voteRequestV2.tag = m->pam.tag;
voteRequestV2.term = m->term;
/* removed field */
memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
break;
}
case 'e':
{
ProposerElected *m = (ProposerElected *) msg;
pq_sendint64_le(buf, m->apm.tag);
pq_sendint64_le(buf, m->term);
pq_sendint64_le(buf, m->startStreamingAt);
pq_sendint32_le(buf, m->termHistory->n_entries);
for (int i = 0; i < m->termHistory->n_entries; i++)
{
pq_sendint64_le(buf, m->termHistory->entries[i].term);
pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
}
pq_sendint64_le(buf, 0); /* removed timeline_start_lsn */
break;
}
case 'a':
/*
* Note: this serializes only AppendRequestHeader, caller is
* expected to append WAL data later.
*/
{
/* v2 sent struct as is */
AppendRequestHeader *m = (AppendRequestHeader *) msg;
AppendRequestHeaderV2 appendRequestHeaderV2;
appendRequestHeaderV2.tag = m->apm.tag;
appendRequestHeaderV2.term = m->term;
appendRequestHeaderV2.epochStartLsn = 0; /* removed field */
appendRequestHeaderV2.beginLsn = m->beginLsn;
appendRequestHeaderV2.endLsn = m->endLsn;
appendRequestHeaderV2.commitLsn = m->commitLsn;
appendRequestHeaderV2.truncateLsn = m->truncateLsn;
/* removed field */
memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));
pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
break;
}
default:
wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
}
return;
}
wp_log(FATAL, "unexpected proto_version %d", proto_version);
}
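For reference, a toy packer that reproduces the byte layout emitted by the 'v' (VoteRequest) branch of the proto_version 3 serializer shown above: a one-byte tag followed by a big-endian 4-byte generation and an 8-byte term, matching what pq_sendint8/pq_sendint32/pq_sendint64 write. The generation and term values below are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* write an 8-byte value in network (big-endian) order, as pq_sendint64 does */
static size_t
pack_u64_be(uint8_t *out, uint64_t v)
{
    for (int i = 0; i < 8; i++)
        out[i] = (uint8_t) (v >> (56 - 8 * i));
    return 8;
}

/* write a 4-byte value in network (big-endian) order, as pq_sendint32 does */
static size_t
pack_u32_be(uint8_t *out, uint32_t v)
{
    for (int i = 0; i < 4; i++)
        out[i] = (uint8_t) (v >> (24 - 8 * i));
    return 4;
}

int
main(void)
{
    uint8_t buf[13];
    size_t len = 0;

    buf[len++] = 'v';                          /* message tag */
    len += pack_u32_be(buf + len, 7);          /* membership generation (example value) */
    len += pack_u64_be(buf + len, 42);         /* proposer term (example value) */

    for (size_t i = 0; i < len; i++)
        printf("%02x ", buf[i]);
    printf("\n");   /* prints: 76 00 00 00 07 00 00 00 00 00 00 00 2a */
    return 0;
}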
/*
* Try to read CopyData message from i'th safekeeper, resetting connection on
* failure.
@@ -1961,37 +1779,6 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
return false;
}
/* Deserialize membership configuration from buf to mconf. */
static void
MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
{
uint32 i;
mconf->generation = pq_getmsgint32(buf);
mconf->members.len = pq_getmsgint32(buf);
mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
for (i = 0; i < mconf->members.len; i++)
{
const char *buf_host;
mconf->members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
mconf->members.m[i].port = pq_getmsgint16(buf);
}
mconf->new_members.len = pq_getmsgint32(buf);
mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
for (i = 0; i < mconf->new_members.len; i++)
{
const char *buf_host;
mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
buf_host = pq_getmsgrawstring(buf);
strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
mconf->new_members.m[i].port = pq_getmsgint16(buf);
}
}
/*
* Read next message with known type into provided struct, by reading a CopyData
* block from the safekeeper's postgres connection, returning whether the read
@@ -2000,8 +1787,6 @@ MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo bu
* If the read needs more polling, we return 'false' and keep the state
* unmodified, waiting until it becomes read-ready to try again. If it fully
* failed, a warning is emitted and the connection is reset.
*
* Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
*/
static bool
AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
@@ -2010,154 +1795,82 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
char *buf;
int buf_size;
uint8 tag;
uint64 tag;
StringInfoData s;
if (!(AsyncRead(sk, &buf, &buf_size)))
return false;
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
/* parse it */
s.data = buf;
s.len = buf_size;
s.maxlen = buf_size;
s.cursor = 0;
if (wp->config->proto_version == 3)
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
tag = pq_getmsgbyte(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->nodeId = pq_getmsgint64(&s);
MembershipConfigurationDeserialize(&msg->mconf, &s);
msg->term = pq_getmsgint64(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->voteGiven = pq_getmsgbyte(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->truncateLsn = pq_getmsgint64(&s);
msg->termHistory.n_entries = pq_getmsgint32(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
}
pq_getmsgend(&s);
return true;
}
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->generation = pq_getmsgint32(&s);
msg->term = pq_getmsgint64(&s);
msg->flushLsn = pq_getmsgint64(&s);
msg->commitLsn = pq_getmsgint64(&s);
msg->hs.ts = pq_getmsgint64(&s);
msg->hs.xmin.value = pq_getmsgint64(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
else if (wp->config->proto_version == 2)
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
switch (tag)
{
tag = pq_getmsgint64_le(&s);
if (tag != anymsg->tag)
{
wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk));
ResetConnection(sk);
return false;
}
switch (tag)
{
case 'g':
case 'g':
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
{
AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->nodeId = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
}
msg->timelineStartLsn = pq_getmsgint64_le(&s);
pq_getmsgend(&s);
return true;
}
case 'v':
{
VoteResponse *msg = (VoteResponse *) anymsg;
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->voteGiven = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->truncateLsn = pq_getmsgint64_le(&s);
msg->termHistory.n_entries = pq_getmsgint32_le(&s);
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
for (int i = 0; i < msg->termHistory.n_entries; i++)
{
msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
}
pq_getmsgint64_le(&s); /* timelineStartLsn */
pq_getmsgend(&s);
return true;
}
msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
case 'a':
{
AppendResponse *msg = (AppendResponse *) anymsg;
msg->term = pq_getmsgint64_le(&s);
msg->flushLsn = pq_getmsgint64_le(&s);
msg->commitLsn = pq_getmsgint64_le(&s);
msg->hs.ts = pq_getmsgint64_le(&s);
msg->hs.xmin.value = pq_getmsgint64_le(&s);
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
if (s.len > s.cursor)
ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
else
msg->ps_feedback.present = false;
pq_getmsgend(&s);
return true;
}
default:
{
wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
return false;
}
}
default:
{
Assert(false);
return false;
}
}
wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
return false; /* keep the compiler quiet */
}
/*
@@ -2533,45 +2246,3 @@ FormatEvents(WalProposer *wp, uint32 events)
return (char *) &return_str;
}
/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
static char *
MembershipConfigurationToString(MembershipConfiguration *mconf)
{
StringInfoData s;
uint32 i;
initStringInfo(&s);
appendStringInfo(&s, "{gen = %u", mconf->generation);
appendStringInfoString(&s, ", members = [");
for (i = 0; i < mconf->members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
}
appendStringInfo(&s, "], new_members = [");
for (i = 0; i < mconf->new_members.len; i++)
{
if (i > 0)
appendStringInfoString(&s, ", ");
appendStringInfo(&s, "{node_id = %lu", mconf->new_members.m[i].node_id);
appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
}
appendStringInfoString(&s, "]}");
return s.data;
}
static void
MembershipConfigurationFree(MembershipConfiguration *mconf)
{
if (mconf->members.m)
pfree(mconf->members.m);
mconf->members.m = NULL;
if (mconf->new_members.m)
pfree(mconf->new_members.m);
mconf->new_members.m = NULL;
}

View File

@@ -12,6 +12,9 @@
#include "neon_walreader.h"
#include "pagestore_client.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
#define MAX_SAFEKEEPERS 32
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL
* message */
@@ -140,71 +143,12 @@ typedef uint64 term_t;
/* neon storage node id */
typedef uint64 NNodeId;
/*
* Number uniquely identifying safekeeper membership configuration.
* This and following structs pair ones in membership.rs.
*/
typedef uint32 Generation;
typedef struct SafekeeperId
{
NNodeId node_id;
char host[MAXCONNINFO];
uint16 port;
} SafekeeperId;
/* Set of safekeepers. */
typedef struct MemberSet
{
uint32 len; /* number of members */
SafekeeperId *m; /* ids themselves */
} MemberSet;
/* Timeline safekeeper membership configuration. */
typedef struct MembershipConfiguration
{
Generation generation;
MemberSet members;
/* Has 0 n_members in non joint conf. */
MemberSet new_members;
} MembershipConfiguration;
/*
* Proposer <-> Acceptor messaging.
*/
typedef struct ProposerAcceptorMessage
{
uint8 tag;
} ProposerAcceptorMessage;
/* Initial Proposer -> Acceptor message */
typedef struct ProposerGreeting
{
ProposerAcceptorMessage pam; /* message tag */
/*
* tenant/timeline ids as C strings with standard hex notation for ease of
* printing. In principle they are not strictly needed as ttid is also
* passed as libpq options.
*/
char *tenant_id;
char *timeline_id;
/* Full conf is carried to allow safekeeper switch */
MembershipConfiguration mconf;
/*
* pg_version and wal_seg_size are used for timeline creation until we
* fully migrate to doing it externally. systemId is only used as a sanity
* cross-check.
*/
uint32 pg_version; /* in PG_VERSION_NUM format */
uint64 system_id; /* Postgres system identifier. */
uint32 wal_seg_size;
} ProposerGreeting;
/* protocol v2 variant, kept while wp supports it */
typedef struct ProposerGreetingV2
{
uint64 tag; /* message tag */
uint32 protocolVersion; /* proposer-safekeeper protocol version */
@@ -215,42 +159,32 @@ typedef struct ProposerGreetingV2
uint8 tenant_id[16];
TimeLineID timeline;
uint32 walSegSize;
} ProposerGreetingV2;
} ProposerGreeting;
typedef struct AcceptorProposerMessage
{
uint8 tag;
uint64 tag;
} AcceptorProposerMessage;
/*
* Acceptor -> Proposer initial response: the highest term acceptor voted for,
* its node id and configuration.
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
*/
typedef struct AcceptorGreeting
{
AcceptorProposerMessage apm;
NNodeId nodeId;
MembershipConfiguration mconf;
term_t term;
NNodeId nodeId;
} AcceptorGreeting;
/*
* Proposer -> Acceptor vote request.
*/
typedef struct VoteRequest
{
ProposerAcceptorMessage pam; /* message tag */
Generation generation; /* membership conf generation */
term_t term;
} VoteRequest;
/* protocol v2 variant, kept while wp supports it */
typedef struct VoteRequestV2
{
uint64 tag;
term_t term;
pg_uuid_t proposerId; /* for monitoring/debugging */
} VoteRequestV2;
} VoteRequest;
/* Element of term switching chain. */
typedef struct TermSwitchEntry
@@ -269,15 +203,8 @@ typedef struct TermHistory
typedef struct VoteResponse
{
AcceptorProposerMessage apm;
/*
* Membership conf generation. It's not strictly required because on
* mismatch safekeeper is expected to ERROR the connection, but let's
* sanity check it.
*/
Generation generation;
term_t term;
uint8 voteGiven;
uint64 voteGiven;
/*
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
@@ -287,6 +214,7 @@ typedef struct VoteResponse
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for
* recovery of some safekeeper */
TermHistory termHistory;
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
} VoteResponse;
/*
@@ -295,37 +223,20 @@ typedef struct VoteResponse
*/
typedef struct ProposerElected
{
AcceptorProposerMessage apm;
Generation generation; /* membership conf generation */
uint64 tag;
term_t term;
/* proposer will send since this point */
XLogRecPtr startStreamingAt;
/* history of term switches up to this proposer */
TermHistory *termHistory;
/* timeline globally starts at this LSN */
XLogRecPtr timelineStartLsn;
} ProposerElected;
/*
* Header of request with WAL message sent from proposer to safekeeper.
*/
typedef struct AppendRequestHeader
{
AcceptorProposerMessage apm;
Generation generation; /* membership conf generation */
term_t term; /* term of the proposer */
XLogRecPtr beginLsn; /* start position of message in WAL */
XLogRecPtr endLsn; /* end position of message in WAL */
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
/*
* minimal LSN which may be needed for recovery of some safekeeper (end
* lsn + 1 of last chunk streamed to everyone)
*/
XLogRecPtr truncateLsn;
/* in the AppendRequest message, WAL data follows */
} AppendRequestHeader;
/* protocol v2 variant, kept while wp supports it */
typedef struct AppendRequestHeaderV2
{
uint64 tag;
term_t term; /* term of the proposer */
@@ -345,8 +256,7 @@ typedef struct AppendRequestHeaderV2
*/
XLogRecPtr truncateLsn;
pg_uuid_t proposerId; /* for monitoring/debugging */
/* in the AppendRequest message, WAL data follows */
} AppendRequestHeaderV2;
} AppendRequestHeader;
/*
* Hot standby feedback received from replica
@@ -399,13 +309,6 @@ typedef struct AppendResponse
{
AcceptorProposerMessage apm;
/*
* Membership conf generation. It's not strictly required because on
* mismatch safekeeper is expected to ERROR the connection, but let's
* sanity check it.
*/
Generation generation;
/*
* Current term of the safekeeper; if it is higher than proposer's, the
* compute is out of date.
@@ -741,8 +644,6 @@ typedef struct WalProposerConfig
/* Will be passed to safekeepers in greet request. */
TimeLineID pgTimeline;
int proto_version;
#ifdef WALPROPOSER_LIB
void *callback_data;
#endif
@@ -755,14 +656,11 @@ typedef struct WalProposerConfig
typedef struct WalProposer
{
WalProposerConfig *config;
/* Current walproposer membership configuration */
MembershipConfiguration mconf;
int n_safekeepers;
/* (n_safekeepers / 2) + 1 */
int quorum;
/* Number of occupied slots in safekeepers[] */
int n_safekeepers;
Safekeeper safekeeper[MAX_SAFEKEEPERS];
/* WAL has been generated up to this point */
@@ -772,7 +670,6 @@ typedef struct WalProposer
XLogRecPtr commitLsn;
ProposerGreeting greetRequest;
ProposerGreetingV2 greetRequestV2;
/* Vote request for safekeeper */
VoteRequest voteRequest;

View File

@@ -117,13 +117,14 @@ pq_getmsgbytes(StringInfo msg, int datalen)
}
/* --------------------------------
* pq_getmsgrawstring - get a null-terminated text string - NO conversion
* pq_getmsgstring - get a null-terminated text string (with conversion)
*
* Returns a pointer directly into the message buffer.
* May return a pointer directly into the message buffer, or a pointer
* to a palloc'd conversion result.
* --------------------------------
*/
const char *
pq_getmsgrawstring(StringInfo msg)
pq_getmsgstring(StringInfo msg)
{
char *str;
int slen;
@@ -154,45 +155,6 @@ pq_getmsgend(StringInfo msg)
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
}
/* --------------------------------
* pq_sendbytes - append raw data to a StringInfo buffer
* --------------------------------
*/
void
pq_sendbytes(StringInfo buf, const void *data, int datalen)
{
/* use variant that maintains a trailing null-byte, out of caution */
appendBinaryStringInfo(buf, data, datalen);
}
/* --------------------------------
* pq_send_ascii_string - append a null-terminated text string (without conversion)
*
* This function intentionally bypasses encoding conversion, instead just
* silently replacing any non-7-bit-ASCII characters with question marks.
* It is used only when we are having trouble sending an error message to
* the client with normal localization and encoding conversion. The caller
* should already have taken measures to ensure the string is just ASCII;
* the extra work here is just to make certain we don't send a badly encoded
* string to the client (which might or might not be robust about that).
*
* NB: passed text string must be null-terminated, and so is the data
* sent to the frontend.
* --------------------------------
*/
void
pq_send_ascii_string(StringInfo buf, const char *str)
{
while (*str)
{
char ch = *str++;
if (IS_HIGHBIT_SET(ch))
ch = '?';
appendStringInfoCharMacro(buf, ch);
}
appendStringInfoChar(buf, '\0');
}
/*
* Produce a C-string representation of a TimestampTz.

View File

@@ -59,11 +59,9 @@
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
/* GUCs */
char *wal_acceptors_list = "";
int wal_acceptor_reconnect_timeout = 1000;
int wal_acceptor_connection_timeout = 10000;
int safekeeper_proto_version = 2;
/* Set to true in the walproposer bgw. */
static bool am_walproposer;
@@ -128,7 +126,6 @@ init_walprop_config(bool syncSafekeepers)
else
walprop_config.systemId = 0;
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
walprop_config.proto_version = safekeeper_proto_version;
}
/*
@@ -222,37 +219,25 @@ nwp_register_gucs(void)
PGC_SIGHUP,
GUC_UNIT_MS,
NULL, NULL, NULL);
DefineCustomIntVariable(
"neon.safekeeper_proto_version",
"Version of compute <-> safekeeper protocol.",
"Used while migrating from 2 to 3.",
&safekeeper_proto_version,
2, 0, INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
}
static int
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
{
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
int n_safekeepers = 0;
char *curr_sk = safekeepers_list;
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
{
if (++n_safekeepers >= MAX_SAFEKEEPERS)
{
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
wpg_log(FATAL, "too many safekeepers");
}
coma = strchr(coma, ',');
safekeepers[n_safekeepers - 1] = curr_sk;
safekeepers[n_safekeepers-1] = curr_sk;
if (coma != NULL)
{
if (coma != NULL) {
*coma++ = '\0';
}
}
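A minimal sketch of the in-place splitting that split_safekeepers_list performs above: each comma is overwritten with a NUL terminator and the output array ends up holding pointers into the caller's buffer. MAX_ENTRIES and the sample host:port strings are illustrative only; the real code uses MAX_SAFEKEEPERS and FATALs on overflow.

#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 4

static int
split_list(char *list, char *out[])
{
    int n = 0;
    char *curr = list;

    for (char *comma = list; comma != NULL && *comma != '\0'; curr = comma)
    {
        if (++n >= MAX_ENTRIES)
            return -1;                 /* too many entries */
        comma = strchr(comma, ',');
        out[n - 1] = curr;             /* entry points into the original buffer */
        if (comma != NULL)
            *comma++ = '\0';           /* terminate entry, step past the comma */
    }
    return n;
}

int
main(void)
{
    char list[] = "sk-0:5454,sk-1:5454,sk-2:5454";
    char *entries[MAX_ENTRIES];
    int n = split_list(list, entries);

    for (int i = 0; i < n; i++)
        printf("%d: %s\n", i, entries[i]);
    return 0;
}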
@@ -267,10 +252,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
static bool
safekeepers_cmp(char *old, char *new)
{
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
char *safekeepers_old[MAX_SAFEKEEPERS];
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
len_old = split_safekeepers_list(old, safekeepers_old);
len_new = split_safekeepers_list(new, safekeepers_new);
@@ -307,8 +292,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
if (!am_walproposer)
return;
if (!newval)
{
if (!newval) {
/* should never happen */
wpg_log(FATAL, "neon.safekeepers is empty");
}
@@ -317,11 +301,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
newval_copy = pstrdup(newval);
oldval = pstrdup(wal_acceptors_list);
/*
/*
* TODO: restarting through FATAL is stupid and introduces 1s delay before
* next bgw start. We should refactor walproposer to allow graceful exit
* and thus remove this delay. XXX: If you change anything here, sync with
* test_safekeepers_reconfigure_reorder.
* next bgw start. We should refactor walproposer to allow graceful exit and
* thus remove this delay.
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
*/
if (!safekeepers_cmp(oldval, newval_copy))
{
@@ -470,8 +454,7 @@ backpressure_throttling_impl(void)
memcpy(new_status, old_status, len);
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
set_ps_display(new_status);
new_status[len] = '\0'; /* truncate off " backpressure ..." to later
* reset the ps */
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
start = GetCurrentTimestamp();
@@ -638,7 +621,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
LSN_FORMAT_ARGS(startpos));
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
cmd.timeline = wp->config->pgTimeline;
cmd.timeline = wp->greetRequest.timeline;
cmd.startpoint = startpos;
StartProposerReplication(wp, &cmd);
}
@@ -1980,11 +1963,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
FullTransactionId xmin = hsFeedback.xmin;
FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
FullTransactionId next_xid = ReadNextFullTransactionId();
/*
* Page server is updating nextXid in checkpoint each 1024
* transactions, so feedback xmin can actually be larger than nextXid
* and TransactionIdInRecentPast returns false in this case,
* Page server is updating nextXid in checkpoint each 1024 transactions,
* so feedback xmin can actually be larger than nextXid and
* TransactionIdInRecentPast returns false in this case,
* preventing update of slot's xmin.
*/
if (FullTransactionIdPrecedes(next_xid, xmin))

View File

@@ -19,6 +19,7 @@ aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
base64.workspace = true
boxcar = "0.2.8"
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
@@ -42,6 +43,7 @@ hyper0.workspace = true
hyper = { workspace = true, features = ["server", "http1", "http2"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
gettid = "0.1.3"
indexmap = { workspace = true, features = ["serde"] }
ipnet.workspace = true
itertools.workspace = true
@@ -50,6 +52,8 @@ lasso = { workspace = true, features = ["multi-threaded"] }
measured = { workspace = true, features = ["lasso"] }
metrics.workspace = true
once_cell.workspace = true
opentelemetry = { workspace = true, features = ["trace"] }
papaya = "0.1.8"
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
@@ -89,6 +93,9 @@ tokio = { workspace = true, features = ["signal"] }
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true
tracing-log.workspace = true
tracing-serde.workspace = true
tracing-opentelemetry.workspace = true
try-lock.workspace = true
typed-json.workspace = true
url.workspace = true
@@ -112,6 +119,7 @@ rsa = "0.9"
workspace_hack.workspace = true
[dev-dependencies]
assert-json-diff.workspace = true
camino-tempfile.workspace = true
fallible-iterator.workspace = true
flate2.workspace = true

View File

@@ -1,10 +1,23 @@
use tracing::Subscriber;
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::hash::BuildHasher;
use std::{env, io};
use chrono::{DateTime, Utc};
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::span;
use tracing::subscriber::Interest;
use tracing::{callsite, Event, Metadata, Span, Subscriber};
use tracing_opentelemetry::OpenTelemetrySpanExt;
use tracing_subscriber::filter::{EnvFilter, LevelFilter};
use tracing_subscriber::fmt::format::{Format, Full};
use tracing_subscriber::fmt::time::SystemTime;
use tracing_subscriber::fmt::{FormatEvent, FormatFields};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::registry::{LookupSpan, SpanRef};
/// Initialize logging and OpenTelemetry tracing and exporter.
///
@@ -15,6 +28,8 @@ use tracing_subscriber::registry::LookupSpan;
/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
pub async fn init() -> anyhow::Result<LoggingGuard> {
let logfmt = LogFormat::from_env()?;
let env_filter = EnvFilter::builder()
.with_default_directive(LevelFilter::INFO.into())
.from_env_lossy()
@@ -29,17 +44,36 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
.expect("this should be a valid filter directive"),
);
let fmt_layer = tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_writer(std::io::stderr)
.with_target(false);
let otlp_layer = tracing_utils::init_tracing("proxy").await;
let json_log_layer = if logfmt == LogFormat::Json {
Some(JsonLoggingLayer {
clock: RealClock,
skipped_field_indices: papaya::HashMap::default(),
writer: StderrWriter {
stderr: std::io::stderr(),
},
})
} else {
None
};
let text_log_layer = if logfmt == LogFormat::Text {
Some(
tracing_subscriber::fmt::layer()
.with_ansi(false)
.with_writer(std::io::stderr)
.with_target(false),
)
} else {
None
};
tracing_subscriber::registry()
.with(env_filter)
.with(otlp_layer)
.with(fmt_layer)
.with(json_log_layer)
.with(text_log_layer)
.try_init()?;
Ok(LoggingGuard)
@@ -94,3 +128,857 @@ impl Drop for LoggingGuard {
tracing_utils::shutdown_tracing();
}
}
// TODO: make JSON the default
#[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
enum LogFormat {
#[default]
Text = 1,
Json,
}
impl LogFormat {
fn from_env() -> anyhow::Result<Self> {
let logfmt = env::var("LOGFMT");
Ok(match logfmt.as_deref() {
Err(_) => LogFormat::default(),
Ok("text") => LogFormat::Text,
Ok("json") => LogFormat::Json,
Ok(logfmt) => anyhow::bail!("unknown log format: {logfmt}"),
})
}
}
trait MakeWriter {
fn make_writer(&self) -> impl io::Write;
}
struct StderrWriter {
stderr: io::Stderr,
}
impl MakeWriter for StderrWriter {
#[inline]
fn make_writer(&self) -> impl io::Write {
self.stderr.lock()
}
}
// TODO: move into separate module or even separate crate.
trait Clock {
fn now(&self) -> DateTime<Utc>;
}
struct RealClock;
impl Clock for RealClock {
#[inline]
fn now(&self) -> DateTime<Utc> {
Utc::now()
}
}
/// Name of the field used by tracing crate to store the event message.
const MESSAGE_FIELD: &str = "message";
thread_local! {
/// Protects against deadlocks and double panics during log writing.
/// The current panic handler will use tracing to log panic information.
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
/// Thread-local instance with per-thread buffer for log writing.
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
/// Cached OS thread ID.
static THREAD_ID: u64 = gettid::gettid();
}
/// Implements tracing layer to handle events specific to logging.
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
clock: C,
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
writer: W,
}
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) {
use std::io::Write;
// TODO: consider special tracing subscriber to grab timestamp very
// early, before OTel machinery, and add as event extension.
let now = self.clock.now();
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
if entered.get() {
let mut formatter = EventFormatter::new();
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
self.writer.make_writer().write_all(formatter.buffer())
} else {
entered.set(true);
defer!(entered.set(false););
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
formatter.reset();
formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
self.writer.make_writer().write_all(formatter.buffer())
})
}
});
// In case logging fails we generate a simpler JSON object.
if let Err(err) = res {
if let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
"timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
"level": "ERROR",
"message": format_args!("cannot log event: {err:?}"),
"fields": {
"event": format_args!("{event:?}"),
},
})) {
line.push(b'\n');
self.writer.make_writer().write_all(&line).ok();
}
}
}
/// Registers a SpanFields instance as span extension.
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let fields = SpanFields::default();
fields.record_fields(attrs);
// This could deadlock when there's a panic somewhere in the tracing
// event handling and a read or write guard is still held. This includes
// the OTel subscriber.
span.extensions_mut().insert(fields);
}
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let ext = span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
data.record_fields(values);
}
}
/// Called (lazily) whenever a new log callsite is registered. We quickly check
/// for duplicate field names and record duplicates as skippable. The last one
/// wins.
fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
if !metadata.is_event() {
// Must not return `Interest::never()`, because then we wouldn't get trace and span data.
return Interest::always();
}
let mut field_indices = SkippedFieldIndices::default();
let mut seen_fields = HashMap::<&'static str, usize>::new();
for field in metadata.fields() {
use std::collections::hash_map::Entry;
match seen_fields.entry(field.name()) {
Entry::Vacant(entry) => {
// field not seen yet
entry.insert(field.index());
}
Entry::Occupied(mut entry) => {
// replace currently stored index
let old_index = entry.insert(field.index());
// ... and append it to list of skippable indices
field_indices.push(old_index);
}
}
}
if !field_indices.is_empty() {
self.skipped_field_indices
.pin()
.insert(metadata.callsite(), field_indices);
}
Interest::always()
}
}
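// Note added for clarity (not part of the diff): combined with the skip set,
// a call like `tracing::error!(a = 1, a = 2, a = 3)` serializes only `a: 3`,
// because earlier occurrences of a duplicated field are marked skippable when
// the callsite is registered. `test_field_collection` below exercises this.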
/// Stores span field values recorded during the span's lifetime.
#[derive(Default)]
struct SpanFields {
// TODO: Switch to custom enum with lasso::Spur for Strings?
fields: papaya::HashMap<&'static str, serde_json::Value>,
}
impl SpanFields {
#[inline]
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
fields.record(&mut SpanFieldsRecorder {
fields: self.fields.pin(),
});
}
}
/// Implements a tracing field visitor to convert and store values.
struct SpanFieldsRecorder<'m, S, G> {
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
}
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if let Ok(value) = i64::try_from(value) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if let Ok(value) = u64::try_from(value) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
/// List of field indices skipped during logging. Can list duplicate fields or
/// metafields not meant to be logged.
#[derive(Clone, Default)]
struct SkippedFieldIndices {
bits: u64,
}
impl SkippedFieldIndices {
#[inline]
fn is_empty(&self) -> bool {
self.bits == 0
}
#[inline]
fn push(&mut self, index: usize) {
self.bits |= 1u64
.checked_shl(index as u32)
.expect("field index too large");
}
#[inline]
fn contains(&self, index: usize) -> bool {
self.bits
& 1u64
.checked_shl(index as u32)
.expect("field index too large")
!= 0
}
}
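// Illustrative sketch, not part of the change above: the skip set is a plain
// 64-bit mask keyed by field index, so it supports at most 64 fields per callsite.
#[cfg(test)]
mod skipped_field_indices_example {
use super::SkippedFieldIndices;
#[test]
fn mask_marks_individual_indices() {
let mut skipped = SkippedFieldIndices::default();
assert!(skipped.is_empty());
skipped.push(2);
assert!(skipped.contains(2));
assert!(!skipped.contains(3));
}
}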
/// Formats a tracing event and writes JSON to its internal buffer including a newline.
// TODO: buffer capacity management, truncate if too large
struct EventFormatter {
logline_buffer: Vec<u8>,
}
impl EventFormatter {
#[inline]
fn new() -> Self {
EventFormatter {
logline_buffer: Vec::new(),
}
}
#[inline]
fn buffer(&self) -> &[u8] {
&self.logline_buffer
}
#[inline]
fn reset(&mut self) {
self.logline_buffer.clear();
}
fn format<S>(
&mut self,
now: DateTime<Utc>,
event: &Event<'_>,
ctx: &Context<'_, S>,
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
) -> io::Result<()>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true);
use tracing_log::NormalizeEvent;
let normalized_meta = event.normalized_metadata();
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
let skipped_field_indices = skipped_field_indices.pin();
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
let mut serialize = || {
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
let mut serializer = serializer.serialize_map(None)?;
// Timestamp comes first, so raw lines can be sorted by timestamp.
serializer.serialize_entry("timestamp", &timestamp)?;
// Level next.
serializer.serialize_entry("level", &meta.level().as_str())?;
// Message next.
serializer.serialize_key("message")?;
let mut message_extractor =
MessageFieldExtractor::new(serializer, skipped_field_indices);
event.record(&mut message_extractor);
let mut serializer = message_extractor.into_serializer()?;
let mut fields_present = FieldsPresent(false, skipped_field_indices);
event.record(&mut fields_present);
if fields_present.0 {
serializer.serialize_entry(
"fields",
&SerializableEventFields(event, skipped_field_indices),
)?;
}
let pid = std::process::id();
if pid != 1 {
serializer.serialize_entry("process_id", &pid)?;
}
THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;
// TODO: tls cache? name could change
if let Some(thread_name) = std::thread::current().name() {
if !thread_name.is_empty() && thread_name != "tokio-runtime-worker" {
serializer.serialize_entry("thread_name", thread_name)?;
}
}
if let Some(task_id) = tokio::task::try_id() {
serializer.serialize_entry("task_id", &format_args!("{task_id}"))?;
}
serializer.serialize_entry("target", meta.target())?;
if let Some(module) = meta.module_path() {
if module != meta.target() {
serializer.serialize_entry("module", module)?;
}
}
if let Some(file) = meta.file() {
if let Some(line) = meta.line() {
serializer.serialize_entry("src", &format_args!("{file}:{line}"))?;
} else {
serializer.serialize_entry("src", file)?;
}
}
{
let otel_context = Span::current().context();
let otel_spanref = otel_context.span();
let span_context = otel_spanref.span_context();
if span_context.is_valid() {
serializer.serialize_entry(
"trace_id",
&format_args!("{}", span_context.trace_id()),
)?;
}
}
serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;
serializer.end()
};
serialize().map_err(io::Error::other)?;
self.logline_buffer.push(b'\n');
Ok(())
}
}
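// For reference (commentary added here, not in the diff), a formatted line has
// roughly this shape, with keys emitted in the order used by `format` above:
//   {"timestamp":"...","level":"ERROR","message":"...","fields":{...},
//    "process_id":1234,"thread_id":42,"thread_name":"...","task_id":"...",
//    "target":"...","module":"...","src":"file.rs:123","trace_id":"...","spans":{...}}
// The optional entries (fields, process_id, thread_name, task_id, module, src,
// trace_id) are omitted when absent or filtered out by the conditions above.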
/// Extracts the message field that's mixed in with the other fields.
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Option<Result<(), S::Error>>,
}
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
state: None,
}
}
#[inline]
fn into_serializer(mut self) -> Result<S, S::Error> {
match self.state {
Some(Ok(())) => {}
Some(Err(err)) => return Err(err),
None => self.serializer.serialize_value("")?,
}
Ok(self.serializer)
}
#[inline]
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_none()
&& field.name() == MESSAGE_FIELD
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}")));
}
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&value));
}
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}")));
}
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
if self.accept_field(field) {
self.state = Some(self.serializer.serialize_value(&format_args!("{value}")));
}
}
}
/// Checks whether any fields or field values are present. If not, the JSON subobject
/// can be skipped.
// This is entirely optional and purely cosmetic, though it may help a bit during log
// parsing in dashboards by not emitting an empty `fields` object when there are none.
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
// Even though some methods have an overhead (error, bytes), it is assumed the
// compiler will optimize it away since we ignore the value entirely.
impl tracing::field::Visit for FieldsPresent<'_> {
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
if !self.1.is_some_and(|i| i.contains(field.index()))
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
{
self.0 |= true;
}
}
}
/// Serializes the fields directly supplied with a log event.
struct SerializableEventFields<'a, 'event>(
&'a tracing::Event<'event>,
Option<&'a SkippedFieldIndices>,
);
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
use serde::ser::SerializeMap;
let serializer = serializer.serialize_map(None)?;
let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
self.0.record(&mut message_skipper);
let serializer = message_skipper.into_serializer()?;
serializer.end()
}
}
/// A tracing field visitor that skips the message field.
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Result<(), S::Error>,
}
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
state: Ok(()),
}
}
#[inline]
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_ok()
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
#[inline]
fn into_serializer(self) -> Result<S, S::Error> {
self.state?;
Ok(self.serializer)
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
if self.accept_field(field) {
self.state = self
.serializer
.serialize_entry(field.name(), &format_args!("{value:x?}"));
}
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
if self.accept_field(field) {
self.state = self.serializer.serialize_entry(field.name(), &value);
}
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
if self.accept_field(field) {
self.state = self
.serializer
.serialize_entry(field.name(), &format_args!("{value:?}"));
}
}
#[inline]
fn record_error(
&mut self,
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
if self.accept_field(field) {
// Serialize as a key/value entry like the other visitors; a bare
// `serialize_value` would emit the value without a key inside the map.
self.state = self
.serializer
.serialize_entry(field.name(), &format_args!("{value}"));
}
}
}
/// Serializes the span stack from root to leaf (the event's parent), enumerated
/// inside an object whose keys are the span's position, zero-padded
/// to retain sort order.
// The object is necessary because Loki cannot flatten arrays.
struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
where
Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
where
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
Ser: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
if let Some(leaf_span) = self.0.lookup_current() {
for (i, span) in leaf_span.scope().from_root().enumerate() {
serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
}
}
serializer.end()
}
}
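// Illustrative sketch, not part of the change above: zero-padded keys sort
// lexicographically in the same order as the numbers they encode, which is what
// preserves root-to-leaf span order for consumers that cannot flatten arrays.
#[cfg(test)]
mod span_stack_key_example {
#[test]
fn padded_keys_preserve_order() {
let keys: Vec<String> = (0..12).map(|i| format!("{i:02}")).collect();
let mut sorted = keys.clone();
sorted.sort();
assert_eq!(sorted, keys);
}
}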
/// Serializes a single span. Includes the span ID, name and its fields as
/// recorded up to this point.
struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
where
Span: for<'lookup> LookupSpan<'lookup>;
impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
Ser: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
// TODO: the span ID is probably only useful for debugging tracing.
serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
serializer.serialize_entry("span_name", self.0.metadata().name())?;
let ext = self.0.extensions();
if let Some(data) = ext.get::<SpanFields>() {
for (key, value) in &data.fields.pin() {
serializer.serialize_entry(key, value)?;
}
}
serializer.end()
}
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use std::sync::{Arc, Mutex, MutexGuard};
use assert_json_diff::assert_json_eq;
use tracing::info_span;
use super::*;
struct TestClock {
current_time: Mutex<DateTime<Utc>>,
}
impl Clock for Arc<TestClock> {
fn now(&self) -> DateTime<Utc> {
*self.current_time.lock().expect("poisoned")
}
}
struct VecWriter<'a> {
buffer: MutexGuard<'a, Vec<u8>>,
}
impl MakeWriter for Arc<Mutex<Vec<u8>>> {
fn make_writer(&self) -> impl io::Write {
VecWriter {
buffer: self.lock().expect("poisoned"),
}
}
}
impl io::Write for VecWriter<'_> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.buffer.write(buf)
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
#[test]
fn test_field_collection() {
let clock = Arc::new(TestClock {
current_time: Mutex::new(Utc::now()),
});
let buffer = Arc::new(Mutex::new(Vec::new()));
let log_layer = JsonLoggingLayer {
clock: clock.clone(),
skipped_field_indices: papaya::HashMap::default(),
writer: buffer.clone(),
};
let registry = tracing_subscriber::Registry::default().with(log_layer);
tracing::subscriber::with_default(registry, || {
info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
info_span!("span2").in_scope(|| {
tracing::error!(
a = 1,
a = 2,
a = 3,
message = "explicit message field",
"implicit message field"
);
});
});
});
let buffer = Arc::try_unwrap(buffer)
.expect("no other reference")
.into_inner()
.expect("poisoned");
let actual: serde_json::Value = serde_json::from_slice(&buffer).expect("valid JSON");
let expected: serde_json::Value = serde_json::json!(
{
"timestamp": clock.now().to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
"level": "ERROR",
"message": "explicit message field",
"fields": {
"a": 3,
},
"spans": {
"00":{
"span_id": "0000000000000001",
"span_name": "span1",
"x": 42,
},
"01": {
"span_id": "0000000000000002",
"span_name": "span2",
}
},
"src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
"target": "proxy::logging::tests",
"process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
"thread_id": actual.as_object().unwrap().get("thread_id").unwrap().as_number().unwrap(),
"thread_name": "logging::tests::test_field_collection",
}
);
assert_json_eq!(actual, expected);
}
}

View File

@@ -88,12 +88,13 @@ fn bench_process_msg(c: &mut Criterion) {
let (lsn, record) = walgen.next().expect("endless WAL");
ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
generation: 0,
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
})
@@ -159,12 +160,13 @@ fn bench_wal_acceptor(c: &mut Criterion) {
.take(n)
.map(|(lsn, record)| AppendRequest {
h: AppendRequestHeader {
generation: 0,
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: Lsn(0),
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
})
@@ -260,12 +262,13 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) {
runtime.block_on(async {
let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
h: AppendRequestHeader {
generation: 0,
term: 1,
term_start_lsn: Lsn(0),
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
});

View File

@@ -8,7 +8,7 @@
use anyhow::Context;
use postgres_backend::QueryError;
use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
use safekeeper_api::membership::Configuration;
use safekeeper_api::{ServerInfo, Term};
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -133,10 +133,10 @@ async fn send_proposer_elected(
let history = TermHistory(history_entries);
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
generation: INVALID_GENERATION,
term,
start_streaming_at: lsn,
term_history: history,
timeline_start_lsn: lsn,
});
tli.process_msg(&proposer_elected_request).await?;
@@ -170,12 +170,13 @@ pub async fn append_logical_message(
let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
h: AppendRequestHeader {
generation: INVALID_GENERATION,
term: msg.term,
term_start_lsn: begin_lsn,
begin_lsn,
end_lsn,
commit_lsn,
truncate_lsn: msg.truncate_lsn,
proposer_uuid: [0u8; 16],
},
wal_data,
});

View File

@@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler {
tokio::select! {
// todo: add read|write .context to these errors
r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
_ = timeline_cancel.cancelled() => {
return Err(CopyStreamHandlerEnd::Cancelled);
}
@@ -342,8 +342,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
let tli = match next_msg {
ProposerAcceptorMessage::Greeting(ref greeting) => {
info!(
"start handshake with walproposer {} sysid {}",
self.peer_addr, greeting.system_id,
"start handshake with walproposer {} sysid {} timeline {}",
self.peer_addr, greeting.system_id, greeting.tli,
);
let server_info = ServerInfo {
pg_version: greeting.pg_version,
@@ -459,7 +459,6 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
pgb_writer: &mut PostgresBackend<IO>,
mut reply_rx: Receiver<AcceptorProposerMessage>,
mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
proto_version: u32,
) -> Result<(), CopyStreamHandlerEnd> {
let mut buf = BytesMut::with_capacity(128);
@@ -497,7 +496,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
};
buf.clear();
msg.serialize(&mut buf, proto_version)?;
msg.serialize(&mut buf)?;
pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
}
}

View File

@@ -7,7 +7,6 @@ use std::{fmt, pin::pin};
use anyhow::{bail, Context};
use futures::StreamExt;
use postgres_protocol::message::backend::ReplicationMessage;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::{PeerInfo, TimelineStatus};
use safekeeper_api::Term;
use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -268,10 +267,7 @@ async fn recover(
);
// Now understand our term history.
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
generation: INVALID_GENERATION,
term: donor.term,
});
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
let vote_response = match tli
.process_msg(&vote_request)
.await
@@ -306,10 +302,10 @@ async fn recover(
// truncate WAL locally
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
generation: INVALID_GENERATION,
term: donor.term,
start_streaming_at: last_common_point.lsn,
term_history: donor_th,
timeline_start_lsn: Lsn::INVALID,
});
// Successful ProposerElected handling always returns None. If term changed,
// we'll find out that during the streaming. Note: it is expected to get
@@ -438,12 +434,13 @@ async fn network_io(
match msg {
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
generation: INVALID_GENERATION,
term: donor.term,
term_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
truncate_lsn: Lsn::INVALID, // do not attempt to advance
proposer_uuid: [0; 16],
};
let ar = AppendRequest {
h: ar_hdr,

View File

@@ -5,11 +5,6 @@ use byteorder::{LittleEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
use safekeeper_api::membership;
use safekeeper_api::membership::Generation;
use safekeeper_api::membership::MemberSet;
use safekeeper_api::membership::SafekeeperId;
use safekeeper_api::membership::INVALID_GENERATION;
use safekeeper_api::models::HotStandbyFeedback;
use safekeeper_api::Term;
use serde::{Deserialize, Serialize};
@@ -17,7 +12,6 @@ use std::cmp::max;
use std::cmp::min;
use std::fmt;
use std::io::Read;
use std::str::FromStr;
use storage_broker::proto::SafekeeperTimelineInfo;
use tracing::*;
@@ -35,8 +29,7 @@ use utils::{
lsn::Lsn,
};
pub const SK_PROTO_VERSION_2: u32 = 2;
pub const SK_PROTO_VERSION_3: u32 = 3;
pub const SK_PROTOCOL_VERSION: u32 = 2;
pub const UNKNOWN_SERVER_VERSION: u32 = 0;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -63,28 +56,8 @@ impl TermHistory {
TermHistory(Vec::new())
}
// Parse TermHistory as n_entries followed by TermLsn pairs in network order.
// Parse TermHistory as n_entries followed by TermLsn pairs
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
let n_entries = bytes
.get_u32_f()
.with_context(|| "TermHistory misses len")?;
let mut res = Vec::with_capacity(n_entries as usize);
for i in 0..n_entries {
let term = bytes
.get_u64_f()
.with_context(|| format!("TermHistory pos {} misses term", i))?;
let lsn = bytes
.get_u64_f()
.with_context(|| format!("TermHistory pos {} misses lsn", i))?
.into();
res.push(TermLsn { term, lsn })
}
Ok(TermHistory(res))
}
// Parse TermHistory as n_entries followed by TermLsn pairs in LE order.
// TODO remove once v2 protocol is fully dropped.
pub fn from_bytes_le(bytes: &mut Bytes) -> Result<TermHistory> {
if bytes.remaining() < 4 {
bail!("TermHistory misses len");
}
@@ -224,18 +197,6 @@ impl AcceptorState {
/// Initial Proposer -> Acceptor message
#[derive(Debug, Deserialize)]
pub struct ProposerGreeting {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mconf: membership::Configuration,
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
pub wal_seg_size: u32,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct ProposerGreetingV2 {
/// proposer-acceptor protocol version
pub protocol_version: u32,
/// Postgres server version
@@ -252,35 +213,27 @@ pub struct ProposerGreetingV2 {
/// (acceptor voted for).
#[derive(Debug, Serialize)]
pub struct AcceptorGreeting {
node_id: NodeId,
mconf: membership::Configuration,
term: u64,
node_id: NodeId,
}
/// Vote request sent from proposer to safekeepers
#[derive(Debug)]
pub struct VoteRequest {
pub generation: Generation,
pub term: Term,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Deserialize)]
pub struct VoteRequestV2 {
pub struct VoteRequest {
pub term: Term,
}
/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
generation: Generation, // membership conf generation
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
vote_given: bool,
vote_given: u64, // fixme u64 due to padding
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
// proposer to choose the most advanced one.
pub flush_lsn: Lsn,
truncate_lsn: Lsn,
pub term_history: TermHistory,
timeline_start_lsn: Lsn,
}
/*
@@ -289,10 +242,10 @@ pub struct VoteResponse {
*/
#[derive(Debug)]
pub struct ProposerElected {
pub generation: Generation, // membership conf generation
pub term: Term,
pub start_streaming_at: Lsn,
pub term_history: TermHistory,
pub timeline_start_lsn: Lsn,
}
/// Request with WAL message sent from proposer to safekeeper. Along the way it
@@ -304,22 +257,6 @@ pub struct AppendRequest {
}
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeader {
pub generation: Generation, // membership conf generation
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
/// start position of message in WAL
pub begin_lsn: Lsn,
/// end position of message in WAL
pub end_lsn: Lsn,
/// LSN committed by quorum of safekeepers
pub commit_lsn: Lsn,
/// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
pub truncate_lsn: Lsn,
}
/// V2 of the message; exists as a struct because we (de)serialized it as is.
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeaderV2 {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field from the protocol, it is unused -- LSN of term
@@ -340,9 +277,6 @@ pub struct AppendRequestHeaderV2 {
/// Report safekeeper state to proposer
#[derive(Debug, Serialize, Clone)]
pub struct AppendResponse {
// Membership conf generation. Not strictly required because on mismatch
// connection is reset, but let's sanity check it.
generation: Generation,
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
@@ -359,9 +293,8 @@ pub struct AppendResponse {
}
impl AppendResponse {
fn term_only(generation: Generation, term: Term) -> AppendResponse {
fn term_only(term: Term) -> AppendResponse {
AppendResponse {
generation,
term,
flush_lsn: Lsn(0),
commit_lsn: Lsn(0),
@@ -382,316 +315,72 @@ pub enum ProposerAcceptorMessage {
FlushWAL,
}
/// Augments Bytes with fallible `get_uN_f` methods, where N is the integer width in bits.
/// All reads are in network (big endian) order.
trait BytesF {
fn get_u8_f(&mut self) -> Result<u8>;
fn get_u16_f(&mut self) -> Result<u16>;
fn get_u32_f(&mut self) -> Result<u32>;
fn get_u64_f(&mut self) -> Result<u64>;
}
impl BytesF for Bytes {
fn get_u8_f(&mut self) -> Result<u8> {
if self.is_empty() {
bail!("no bytes left, expected 1");
}
Ok(self.get_u8())
}
fn get_u16_f(&mut self) -> Result<u16> {
if self.remaining() < 2 {
bail!("no bytes left, expected 2");
}
Ok(self.get_u16())
}
fn get_u32_f(&mut self) -> Result<u32> {
if self.remaining() < 4 {
bail!("only {} bytes left, expected 4", self.remaining());
}
Ok(self.get_u32())
}
fn get_u64_f(&mut self) -> Result<u64> {
if self.remaining() < 8 {
bail!("only {} bytes left, expected 8", self.remaining());
}
Ok(self.get_u64())
}
}
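// Illustrative sketch, not part of the change above: the `*_f` accessors return
// an error instead of panicking when the buffer runs short, which the tag and
// field parsing below relies on.
#[cfg(test)]
mod bytes_f_example {
use super::BytesF;
use bytes::Bytes;
#[test]
fn short_buffer_is_an_error() {
let mut buf = Bytes::from_static(&[0x01u8, 0x02]);
assert_eq!(buf.get_u16_f().unwrap(), 0x0102);
assert!(buf.get_u32_f().is_err());
}
}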
impl ProposerAcceptorMessage {
/// Read cstring from Bytes.
fn get_cstr(buf: &mut Bytes) -> Result<String> {
let pos = buf
.iter()
.position(|x| *x == 0)
.ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?;
let result = buf.split_to(pos);
buf.advance(1); // drop the null terminator
match std::str::from_utf8(&result) {
Ok(s) => Ok(s.to_string()),
Err(e) => bail!("invalid utf8 in cstring: {}", e),
}
}
/// Read membership::Configuration from Bytes.
fn get_mconf(buf: &mut Bytes) -> Result<membership::Configuration> {
let generation = buf.get_u32_f().with_context(|| "reading generation")?;
let members_len = buf.get_u32_f().with_context(|| "reading members_len")?;
// The main member set must have at least one member in a valid configuration.
// Empty conf is allowed until we fully migrate.
if generation != INVALID_GENERATION && members_len == 0 {
bail!("empty members_len");
}
let mut members = MemberSet::empty();
for i in 0..members_len {
let id = buf
.get_u64_f()
.with_context(|| format!("reading member {} node_id", i))?;
let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?;
let pg_port = buf
.get_u16_f()
.with_context(|| format!("reading member {} port", i))?;
let sk = SafekeeperId {
id: NodeId(id),
host,
pg_port,
};
members.add(sk)?;
}
let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?;
// Non joint conf.
if new_members_len == 0 {
Ok(membership::Configuration {
generation,
members,
new_members: None,
})
} else {
let mut new_members = MemberSet::empty();
for i in 0..new_members_len {
let id = buf
.get_u64_f()
.with_context(|| format!("reading new member {} node_id", i))?;
let host = Self::get_cstr(buf)
.with_context(|| format!("reading new member {} host", i))?;
let pg_port = buf
.get_u16_f()
.with_context(|| format!("reading new member {} port", i))?;
let sk = SafekeeperId {
id: NodeId(id),
host,
pg_port,
};
new_members.add(sk)?;
}
Ok(membership::Configuration {
generation,
members,
new_members: Some(new_members),
})
}
}
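// Wire layout read by `get_mconf` above (recap added for clarity; integers are
// big endian, strings null-terminated):
//   generation: u32
//   members_len: u32, then members_len x (node_id: u64, host: cstring, pg_port: u16)
//   new_members_len: u32 (0 means a non-joint configuration), then the same
//   per-member tuple repeated new_members_len times.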
/// Parse proposer message.
pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
if proto_version == SK_PROTO_VERSION_3 {
if msg_bytes.is_empty() {
bail!("ProposerAcceptorMessage is not complete: missing tag");
pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
if proto_version != SK_PROTOCOL_VERSION {
bail!(
"incompatible protocol version {}, expected {}",
proto_version,
SK_PROTOCOL_VERSION
);
}
// xxx using Reader is inefficient but easy to work with bincode
let mut stream = msg_bytes.reader();
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
match tag {
'g' => {
let msg = ProposerGreeting::des_from(&mut stream)?;
Ok(ProposerAcceptorMessage::Greeting(msg))
}
let tag = msg_bytes.get_u8_f().with_context(|| {
"ProposerAcceptorMessage is not complete: missing tag".to_string()
})? as char;
match tag {
'g' => {
let tenant_id_str =
Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?;
let tenant_id = TenantId::from_str(&tenant_id_str)?;
let timeline_id_str =
Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?;
let timeline_id = TimelineId::from_str(&timeline_id_str)?;
let mconf = Self::get_mconf(&mut msg_bytes)?;
let pg_version = msg_bytes
.get_u32_f()
.with_context(|| "reading pg_version")?;
let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?;
let wal_seg_size = msg_bytes
.get_u32_f()
.with_context(|| "reading wal_seg_size")?;
let g = ProposerGreeting {
tenant_id,
timeline_id,
mconf,
pg_version,
system_id,
wal_seg_size,
};
Ok(ProposerAcceptorMessage::Greeting(g))
}
'v' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let v = VoteRequest { generation, term };
Ok(ProposerAcceptorMessage::VoteRequest(v))
}
'e' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let start_streaming_at: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading start_streaming_at")?
.into();
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
let msg = ProposerElected {
generation,
term,
start_streaming_at,
term_history,
};
Ok(ProposerAcceptorMessage::Elected(msg))
}
'a' => {
let generation = msg_bytes
.get_u32_f()
.with_context(|| "reading generation")?;
let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
let begin_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading begin_lsn")?
.into();
let end_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading end_lsn")?
.into();
let commit_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading commit_lsn")?
.into();
let truncate_lsn: Lsn = msg_bytes
.get_u64_f()
.with_context(|| "reading truncate_lsn")?
.into();
let hdr = AppendRequestHeader {
generation,
term,
begin_lsn,
end_lsn,
commit_lsn,
truncate_lsn,
};
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
}
if msg_bytes.remaining() < rec_size {
bail!(
"reading WAL: only {} bytes left, wanted {}",
msg_bytes.remaining(),
rec_size
);
}
let wal_data = msg_bytes.copy_to_bytes(rec_size);
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
'v' => {
let msg = VoteRequest::des_from(&mut stream)?;
Ok(ProposerAcceptorMessage::VoteRequest(msg))
}
} else if proto_version == SK_PROTO_VERSION_2 {
// xxx using Reader is inefficient but easy to work with bincode
let mut stream = msg_bytes.reader();
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
match tag {
'g' => {
let msgv2 = ProposerGreetingV2::des_from(&mut stream)?;
let g = ProposerGreeting {
tenant_id: msgv2.tenant_id,
timeline_id: msgv2.timeline_id,
mconf: membership::Configuration {
generation: INVALID_GENERATION,
members: MemberSet::empty(),
new_members: None,
},
pg_version: msgv2.pg_version,
system_id: msgv2.system_id,
wal_seg_size: msgv2.wal_seg_size,
};
Ok(ProposerAcceptorMessage::Greeting(g))
'e' => {
let mut msg_bytes = stream.into_inner();
if msg_bytes.remaining() < 16 {
bail!("ProposerElected message is not complete");
}
'v' => {
let msg = VoteRequestV2::des_from(&mut stream)?;
let v = VoteRequest {
generation: INVALID_GENERATION,
term: msg.term,
};
Ok(ProposerAcceptorMessage::VoteRequest(v))
let term = msg_bytes.get_u64_le();
let start_streaming_at = msg_bytes.get_u64_le().into();
let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
if msg_bytes.remaining() < 8 {
bail!("ProposerElected message is not complete");
}
'e' => {
let mut msg_bytes = stream.into_inner();
if msg_bytes.remaining() < 16 {
bail!("ProposerElected message is not complete");
}
let term = msg_bytes.get_u64_le();
let start_streaming_at = msg_bytes.get_u64_le().into();
let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?;
if msg_bytes.remaining() < 8 {
bail!("ProposerElected message is not complete");
}
let _timeline_start_lsn = msg_bytes.get_u64_le();
let msg = ProposerElected {
generation: INVALID_GENERATION,
term,
start_streaming_at,
term_history,
};
Ok(ProposerAcceptorMessage::Elected(msg))
}
'a' => {
// read header followed by wal data
let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?;
let hdr = AppendRequestHeader {
generation: INVALID_GENERATION,
term: hdrv2.term,
begin_lsn: hdrv2.begin_lsn,
end_lsn: hdrv2.end_lsn,
commit_lsn: hdrv2.commit_lsn,
truncate_lsn: hdrv2.truncate_lsn,
};
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
}
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
let timeline_start_lsn = msg_bytes.get_u64_le().into();
let msg = ProposerElected {
term,
start_streaming_at,
timeline_start_lsn,
term_history,
};
Ok(ProposerAcceptorMessage::Elected(msg))
}
} else {
bail!("unsupported protocol version {}", proto_version);
'a' => {
// read header followed by wal data
let hdr = AppendRequestHeader::des_from(&mut stream)?;
let rec_size = hdr
.end_lsn
.checked_sub(hdr.begin_lsn)
.context("begin_lsn > end_lsn in AppendRequest")?
.0 as usize;
if rec_size > MAX_SEND_SIZE {
bail!(
"AppendRequest is longer than MAX_SEND_SIZE ({})",
MAX_SEND_SIZE
);
}
let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
stream.read_exact(&mut wal_data_vec)?;
let wal_data = Bytes::from(wal_data_vec);
let msg = AppendRequest { h: hdr, wal_data };
Ok(ProposerAcceptorMessage::AppendRequest(msg))
}
_ => bail!("unknown proposer-acceptor message tag: {}", tag),
}
}
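// Message tags handled by `parse` above (recap added for clarity):
//   'g' = ProposerGreeting, 'v' = VoteRequest, 'e' = ProposerElected,
//   'a' = AppendRequest (header followed by the WAL payload itself).
// Any other tag is rejected with an error.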
@@ -705,21 +394,36 @@ impl ProposerAcceptorMessage {
// We explicitly list all fields, to draw attention here when new fields are added.
let mut size = BASE_SIZE;
size += match self {
Self::Greeting(_) => 0,
Self::Greeting(ProposerGreeting {
protocol_version: _,
pg_version: _,
proposer_id: _,
system_id: _,
timeline_id: _,
tenant_id: _,
tli: _,
wal_seg_size: _,
}) => 0,
Self::VoteRequest(_) => 0,
Self::VoteRequest(VoteRequest { term: _ }) => 0,
Self::Elected(_) => 0,
Self::Elected(ProposerElected {
term: _,
start_streaming_at: _,
term_history: _,
timeline_start_lsn: _,
}) => 0,
Self::AppendRequest(AppendRequest {
h:
AppendRequestHeader {
generation: _,
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
@@ -727,12 +431,13 @@ impl ProposerAcceptorMessage {
Self::NoFlushAppendRequest(AppendRequest {
h:
AppendRequestHeader {
generation: _,
term: _,
term_start_lsn: _,
begin_lsn: _,
end_lsn: _,
commit_lsn: _,
truncate_lsn: _,
proposer_uuid: _,
},
wal_data,
}) => wal_data.len(),
@@ -753,118 +458,45 @@ pub enum AcceptorProposerMessage {
}
impl AcceptorProposerMessage {
fn put_cstr(buf: &mut BytesMut, s: &str) {
buf.put_slice(s.as_bytes());
buf.put_u8(0); // null terminator
}
/// Serialize membership::Configuration into buf.
fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) {
buf.put_u32(mconf.generation);
buf.put_u32(mconf.members.m.len() as u32);
for sk in &mconf.members.m {
buf.put_u64(sk.id.0);
Self::put_cstr(buf, &sk.host);
buf.put_u16(sk.pg_port);
}
if let Some(ref new_members) = mconf.new_members {
buf.put_u32(new_members.m.len() as u32);
for sk in &new_members.m {
buf.put_u64(sk.id.0);
Self::put_cstr(buf, &sk.host);
buf.put_u16(sk.pg_port);
}
} else {
buf.put_u32(0);
}
}
/// Serialize acceptor -> proposer message.
pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> {
if proto_version == SK_PROTO_VERSION_3 {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u8(b'g');
buf.put_u64(msg.node_id.0);
Self::serialize_mconf(buf, &msg.mconf);
buf.put_u64(msg.term)
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u64_le('g' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.node_id.0);
}
AcceptorProposerMessage::VoteResponse(msg) => {
buf.put_u64_le('v' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.vote_given);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.truncate_lsn.into());
buf.put_u32_le(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64_le(e.term);
buf.put_u64_le(e.lsn.into());
}
AcceptorProposerMessage::VoteResponse(msg) => {
buf.put_u8(b'v');
buf.put_u32(msg.generation);
buf.put_u64(msg.term);
buf.put_u8(msg.vote_given as u8);
buf.put_u64(msg.flush_lsn.into());
buf.put_u64(msg.truncate_lsn.into());
buf.put_u32(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64(e.term);
buf.put_u64(e.lsn.into());
}
}
AcceptorProposerMessage::AppendResponse(msg) => {
buf.put_u8(b'a');
buf.put_u32(msg.generation);
buf.put_u64(msg.term);
buf.put_u64(msg.flush_lsn.into());
buf.put_u64(msg.commit_lsn.into());
buf.put_i64(msg.hs_feedback.ts);
buf.put_u64(msg.hs_feedback.xmin);
buf.put_u64(msg.hs_feedback.catalog_xmin);
buf.put_u64_le(msg.timeline_start_lsn.into());
}
AcceptorProposerMessage::AppendResponse(msg) => {
buf.put_u64_le('a' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.commit_lsn.into());
buf.put_i64_le(msg.hs_feedback.ts);
buf.put_u64_le(msg.hs_feedback.xmin);
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
}
Ok(())
// TODO remove 3 after converting all msgs
} else if proto_version == SK_PROTO_VERSION_2 {
match self {
AcceptorProposerMessage::Greeting(msg) => {
buf.put_u64_le('g' as u64);
// v2 didn't have mconf and fields were reordered
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.node_id.0);
}
AcceptorProposerMessage::VoteResponse(msg) => {
// v2 didn't have generation, had u64 vote_given and timeline_start_lsn
buf.put_u64_le('v' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.vote_given as u64);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.truncate_lsn.into());
buf.put_u32_le(msg.term_history.0.len() as u32);
for e in &msg.term_history.0 {
buf.put_u64_le(e.term);
buf.put_u64_le(e.lsn.into());
}
// removed timeline_start_lsn
buf.put_u64_le(0);
}
AcceptorProposerMessage::AppendResponse(msg) => {
// v2 didn't have generation
buf.put_u64_le('a' as u64);
buf.put_u64_le(msg.term);
buf.put_u64_le(msg.flush_lsn.into());
buf.put_u64_le(msg.commit_lsn.into());
buf.put_i64_le(msg.hs_feedback.ts);
buf.put_u64_le(msg.hs_feedback.xmin);
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
// AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
// if it is not present.
if let Some(ref msg) = msg.pageserver_feedback {
msg.serialize(buf);
}
}
}
Ok(())
} else {
bail!("unsupported protocol version {}", proto_version);
}
Ok(())
}
}
@@ -961,6 +593,14 @@ where
&mut self,
msg: &ProposerGreeting,
) -> Result<Option<AcceptorProposerMessage>> {
// Check protocol compatibility
if msg.protocol_version != SK_PROTOCOL_VERSION {
bail!(
"incompatible protocol version {}, expected {}",
msg.protocol_version,
SK_PROTOCOL_VERSION
);
}
/* Postgres major version mismatch is treated as fatal error
* because safekeepers parse WAL headers and the format
* may change between versions.
@@ -1015,16 +655,15 @@ where
self.state.finish_change(&state).await?;
}
let apg = AcceptorGreeting {
node_id: self.node_id,
mconf: self.state.mconf.clone(),
term: self.state.acceptor_state.term,
};
info!(
"processed greeting {:?} from walproposer, sending {:?}",
msg, apg
"processed greeting from walproposer {}, sending term {:?}",
msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
self.state.acceptor_state.term
);
Ok(Some(AcceptorProposerMessage::Greeting(apg)))
Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
term: self.state.acceptor_state.term,
node_id: self.node_id,
})))
}
/// Give vote for the given term, if we haven't done that previously.
@@ -1045,12 +684,12 @@ where
self.wal_store.flush_wal().await?;
// initialize with refusal
let mut resp = VoteResponse {
generation: self.state.mconf.generation,
term: self.state.acceptor_state.term,
vote_given: false,
vote_given: false as u64,
flush_lsn: self.flush_lsn(),
truncate_lsn: self.state.inmem.peer_horizon_lsn,
term_history: self.get_term_history(),
timeline_start_lsn: self.state.timeline_start_lsn,
};
if self.state.acceptor_state.term < msg.term {
let mut state = self.state.start_change();
@@ -1059,16 +698,15 @@ where
self.state.finish_change(&state).await?;
resp.term = self.state.acceptor_state.term;
resp.vote_given = true;
resp.vote_given = true as u64;
}
info!("processed {:?}: sending {:?}", msg, &resp);
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
}
/// Form AppendResponse from current state.
fn append_response(&self) -> AppendResponse {
let ar = AppendResponse {
generation: self.state.mconf.generation,
term: self.state.acceptor_state.term,
flush_lsn: self.flush_lsn(),
commit_lsn: self.state.commit_lsn,
@@ -1167,22 +805,18 @@ where
// Here we learn initial LSN for the first time, set fields
// interested in that.
if let Some(start_lsn) = msg.term_history.0.first() {
if state.timeline_start_lsn == Lsn(0) {
// Remember point where WAL begins globally. In the future it
// will be initialized immediately on timeline creation.
state.timeline_start_lsn = start_lsn.lsn;
info!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
}
if state.timeline_start_lsn == Lsn(0) {
// Remember point where WAL begins globally.
state.timeline_start_lsn = msg.timeline_start_lsn;
info!(
"setting timeline_start_lsn to {:?}",
state.timeline_start_lsn
);
}
if state.peer_horizon_lsn == Lsn(0) {
// Update peer_horizon_lsn as soon as we know where timeline starts.
// It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
state.peer_horizon_lsn = state.timeline_start_lsn;
state.peer_horizon_lsn = msg.timeline_start_lsn;
}
if state.local_start_lsn == Lsn(0) {
state.local_start_lsn = msg.start_streaming_at;
@@ -1262,10 +896,7 @@ where
// If our term is higher, immediately refuse the message.
if self.state.acceptor_state.term > msg.h.term {
let resp = AppendResponse::term_only(
self.state.mconf.generation,
self.state.acceptor_state.term,
);
let resp = AppendResponse::term_only(self.state.acceptor_state.term);
return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
}
@@ -1293,8 +924,10 @@ where
);
}
// Now we know that we are in the same term as the proposer, process the
// message.
// Now we know that we are in the same term as the proposer,
// processing the message.
self.state.inmem.proposer_uuid = msg.h.proposer_uuid;
// do the job
if !msg.wal_data.is_empty() {
@@ -1464,13 +1097,10 @@ mod tests {
let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
// check voting for 1 is ok
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
generation: 0,
term: 1,
});
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
let mut vote_resp = sk.process_msg(&vote_request).await;
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given),
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
r => panic!("unexpected response: {:?}", r),
}
@@ -1485,7 +1115,7 @@ mod tests {
// and ensure voting second time for 1 is not ok
vote_resp = sk.process_msg(&vote_request).await;
match vote_resp.unwrap() {
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given),
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
r => panic!("unexpected response: {:?}", r),
}
}
@@ -1500,12 +1130,13 @@ mod tests {
let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
let mut ar_hdr = AppendRequestHeader {
generation: 0,
term: 2,
term_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
};
let mut append_request = AppendRequest {
h: ar_hdr.clone(),
@@ -1513,7 +1144,6 @@ mod tests {
};
let pem = ProposerElected {
generation: 0,
term: 2,
start_streaming_at: Lsn(1),
term_history: TermHistory(vec![
@@ -1526,6 +1156,7 @@ mod tests {
lsn: Lsn(3),
},
]),
timeline_start_lsn: Lsn(1),
};
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
.await
@@ -1560,25 +1191,26 @@ mod tests {
let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();
let pem = ProposerElected {
generation: 0,
term: 1,
start_streaming_at: Lsn(1),
term_history: TermHistory(vec![TermLsn {
term: 1,
lsn: Lsn(1),
}]),
timeline_start_lsn: Lsn(1),
};
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
.await
.unwrap();
let ar_hdr = AppendRequestHeader {
generation: 0,
term: 1,
term_start_lsn: Lsn(3),
begin_lsn: Lsn(1),
end_lsn: Lsn(2),
commit_lsn: Lsn(0),
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
};
let append_request = AppendRequest {
h: ar_hdr.clone(),

View File

@@ -120,6 +120,20 @@ pub enum InterpretedWalReaderError {
WalStreamClosed,
}
enum CurrentPositionUpdate {
Reset(Lsn),
NotReset(Lsn),
}
impl CurrentPositionUpdate {
fn current_position(&self) -> Lsn {
match self {
CurrentPositionUpdate::Reset(lsn) => *lsn,
CurrentPositionUpdate::NotReset(lsn) => *lsn,
}
}
}
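// Illustrative sketch, not part of the change above: both variants carry the
// position the reader ends up at, so callers can log it uniformly regardless of
// whether a reset happened. Assumes the usual `utils::lsn::Lsn` import used
// elsewhere in this crate.
#[cfg(test)]
mod current_position_update_example {
use super::CurrentPositionUpdate;
use utils::lsn::Lsn;
#[test]
fn both_variants_expose_position() {
assert_eq!(CurrentPositionUpdate::Reset(Lsn(16)).current_position(), Lsn(16));
assert_eq!(CurrentPositionUpdate::NotReset(Lsn(8)).current_position(), Lsn(8));
}
}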
impl InterpretedWalReaderState {
fn current_position(&self) -> Option<Lsn> {
match self {
@@ -129,6 +143,26 @@ impl InterpretedWalReaderState {
InterpretedWalReaderState::Done => None,
}
}
// Reset the current position of the WAL reader if the requested starting position
// of the new shard is smaller than the current value.
fn maybe_reset(&mut self, new_shard_start_pos: Lsn) -> CurrentPositionUpdate {
match self {
InterpretedWalReaderState::Running {
current_position, ..
} => {
if new_shard_start_pos < *current_position {
*current_position = new_shard_start_pos;
CurrentPositionUpdate::Reset(*current_position)
} else {
CurrentPositionUpdate::NotReset(*current_position)
}
}
InterpretedWalReaderState::Done => {
panic!("maybe_reset called on finished reader")
}
}
}
}
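// Recap added for clarity: attaching a shard below the cursor lowers
// `current_position` and tells the caller to rewind the WAL stream, while
// attaching at or above it leaves the stream untouched.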
pub(crate) struct AttachShardNotification {
@@ -410,15 +444,24 @@ impl InterpretedWalReader {
};
senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos});
let current_pos = self.state.read().unwrap().current_position().unwrap();
if start_pos < current_pos {
self.wal_stream.reset(start_pos).await;
wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
}
// If the shard is subscribing below the current position, then we need
// to update the cursor that tracks where we are in the WAL
// ([`Self::state`]) and reset the WAL stream itself
// ([`Self::wal_stream`]). This must be done atomically from the POV of
// anything outside the select statement.
let position_reset = self.state.write().unwrap().maybe_reset(start_pos);
match position_reset {
CurrentPositionUpdate::Reset(to) => {
self.wal_stream.reset(to).await;
wal_decoder = WalStreamDecoder::new(to, self.pg_version);
},
CurrentPositionUpdate::NotReset(_) => {}
};
tracing::info!(
"Added shard sender {} with start_pos={} current_pos={}",
ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos
ShardSenderId::new(shard_id, new_sender_id), start_pos, position_reset.current_position()
);
}
}
@@ -584,7 +627,7 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
.await
.unwrap();
let end_pos = end_watch.get();
@@ -715,7 +758,6 @@ mod tests {
const MSG_COUNT: usize = 200;
const PG_VERSION: u32 = 17;
const SHARD_COUNT: u8 = 2;
const ATTACHED_SHARDS: u8 = 4;
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
let env = Env::new(true).unwrap();
@@ -725,9 +767,11 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
.await
.unwrap();
let mut next_record_lsns = Vec::default();
let end_watch =
Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
.await
.unwrap();
let end_pos = end_watch.get();
let streaming_wal_reader = StreamingWalReader::new(
@@ -746,38 +790,71 @@ mod tests {
)
.unwrap();
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
let mut batch_receivers = vec![rx];
struct Sender {
tx: Option<tokio::sync::mpsc::Sender<Batch>>,
rx: tokio::sync::mpsc::Receiver<Batch>,
shard: ShardIdentity,
start_lsn: Lsn,
received_next_record_lsns: Vec<Lsn>,
}
impl Sender {
fn new(start_lsn: Lsn, shard: ShardIdentity) -> Self {
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
Self {
tx: Some(tx),
rx,
shard,
start_lsn,
received_next_record_lsns: Vec::default(),
}
}
}
assert!(next_record_lsns.len() > 7);
let start_lsns = vec![
next_record_lsns[5],
next_record_lsns[1],
next_record_lsns[3],
];
let mut senders = start_lsns
.into_iter()
.map(|lsn| Sender::new(lsn, shard_0))
.collect::<Vec<_>>();
let first_sender = senders.first_mut().unwrap();
let handle = InterpretedWalReader::spawn(
streaming_wal_reader,
start_lsn,
tx,
shard_0,
first_sender.start_lsn,
first_sender.tx.take().unwrap(),
first_sender.shard,
PG_VERSION,
&Some("pageserver".to_string()),
);
for _ in 0..(ATTACHED_SHARDS - 1) {
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
handle.fanout(shard_0, tx, start_lsn).unwrap();
batch_receivers.push(rx);
for sender in senders.iter_mut().skip(1) {
handle
.fanout(sender.shard, sender.tx.take().unwrap(), sender.start_lsn)
.unwrap();
}
loop {
let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap();
for rx in batch_receivers.iter_mut().skip(1) {
let other_batch = rx.recv().await.unwrap();
assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn);
assert_eq!(
batch.available_wal_end_lsn,
other_batch.available_wal_end_lsn
for sender in senders.iter_mut() {
loop {
let batch = sender.rx.recv().await.unwrap();
tracing::info!(
"Sender with start_lsn={} received batch ending at {} with {} records",
sender.start_lsn,
batch.wal_end_lsn,
batch.records.records.len()
);
}
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
for rec in batch.records.records {
sender.received_next_record_lsns.push(rec.next_record_lsn);
}
if batch.wal_end_lsn == batch.available_wal_end_lsn {
break;
}
}
}
@@ -792,5 +869,20 @@ mod tests {
}
assert!(done);
for sender in senders {
tracing::info!(
"Validating records received by sender with start_lsn={}",
sender.start_lsn
);
assert!(sender.received_next_record_lsns.is_sorted());
let expected = next_record_lsns
.iter()
.filter(|lsn| **lsn > sender.start_lsn)
.copied()
.collect::<Vec<_>>();
assert_eq!(sender.received_next_record_lsns, expected);
}
}
}

View File

@@ -73,10 +73,10 @@ impl Env {
// Emulate an initial election.
safekeeper
.process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
generation: 0,
term: 1,
start_streaming_at: start_lsn,
term_history: TermHistory(vec![(1, start_lsn).into()]),
timeline_start_lsn: start_lsn,
}))
.await?;
@@ -122,6 +122,7 @@ impl Env {
start_lsn: Lsn,
msg_size: usize,
msg_count: usize,
mut next_record_lsns: Option<&mut Vec<Lsn>>,
) -> anyhow::Result<EndWatch> {
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE);
@@ -130,7 +131,7 @@ impl Env {
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
let prefix = c"p";
let prefix = c"neon-file:";
let prefixlen = prefix.to_bytes_with_nul().len();
assert!(msg_size >= prefixlen);
let message = vec![0; msg_size - prefixlen];
@@ -139,15 +140,19 @@ impl Env {
&mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn);
for _ in 0..msg_count {
let (lsn, record) = walgen.next().unwrap();
if let Some(ref mut lsns) = next_record_lsns {
lsns.push(lsn);
}
let req = AppendRequest {
h: AppendRequestHeader {
generation: 0,
term: 1,
term_start_lsn: start_lsn,
begin_lsn: lsn,
end_lsn: lsn + record.len() as u64,
commit_lsn: lsn,
truncate_lsn: Lsn(0),
proposer_uuid: [0; 16],
},
wal_data: record,
};

View File

@@ -246,7 +246,7 @@ mod tests {
.unwrap();
let resident_tli = tli.wal_residence_guard().await.unwrap();
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT)
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
.await
.unwrap();
let end_pos = end_watch.get();

View File

@@ -15,7 +15,9 @@ use desim::{
};
use http::Uri;
use safekeeper::{
safekeeper::{ProposerAcceptorMessage, SafeKeeper, SK_PROTO_VERSION_3, UNKNOWN_SERVER_VERSION},
safekeeper::{
ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION,
},
state::{TimelinePersistentState, TimelineState},
timeline::TimelineError,
wal_storage::Storage,
@@ -285,7 +287,7 @@ impl ConnState {
bail!("finished processing START_REPLICATION")
}
let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTO_VERSION_3)?;
let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?;
debug!("got msg: {:?}", msg);
self.process(msg, global)
} else {
@@ -401,7 +403,7 @@ impl ConnState {
// TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
let mut buf = BytesMut::with_capacity(128);
reply.serialize(&mut buf, SK_PROTO_VERSION_3)?;
reply.serialize(&mut buf)?;
self.tcp.send(AnyMessage::Bytes(buf.into()));
}

View File

@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
// We shutdown while sending
#[error("Shutting down")]
ShuttingDown,
// A response indicates we will never succeed, such as 400 or 404
// A response indicates we will never succeed, such as 400 or 403
#[error("Non-retryable error {0}")]
Fatal(StatusCode),

View File

@@ -27,7 +27,7 @@ use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use rustls::client::danger::ServerCertVerifier;
use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
use rustls::client::WebPkiServerVerifier;
use rustls::crypto::ring;
use scoped_futures::ScopedBoxFuture;
@@ -194,6 +194,8 @@ impl Persistence {
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
log_postgres_connstr_info(database_url)
.map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
loop {
match establish_connection_rustls(database_url).await {
Ok(_) => {
@@ -1281,6 +1283,51 @@ pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
Ok(Arc::new(store))
}
#[derive(Debug)]
/// A verifier that accepts all certificates (but still logs the validation error)
struct AcceptAll(Arc<WebPkiServerVerifier>);
impl ServerCertVerifier for AcceptAll {
fn verify_server_cert(
&self,
end_entity: &rustls::pki_types::CertificateDer<'_>,
intermediates: &[rustls::pki_types::CertificateDer<'_>],
server_name: &rustls::pki_types::ServerName<'_>,
ocsp_response: &[u8],
now: rustls::pki_types::UnixTime,
) -> Result<ServerCertVerified, rustls::Error> {
let r =
self.0
.verify_server_cert(end_entity, intermediates, server_name, ocsp_response, now);
if let Err(err) = r {
tracing::info!(
?server_name,
"ignoring db connection TLS validation error: {err:?}"
);
return Ok(ServerCertVerified::assertion());
}
r
}
fn verify_tls12_signature(
&self,
message: &[u8],
cert: &rustls::pki_types::CertificateDer<'_>,
dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
self.0.verify_tls12_signature(message, cert, dss)
}
fn verify_tls13_signature(
&self,
message: &[u8],
cert: &rustls::pki_types::CertificateDer<'_>,
dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
self.0.verify_tls13_signature(message, cert, dss)
}
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
self.0.supported_verify_schemes()
}
}
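A custom verifier such as AcceptAll is typically installed through rustls' explicit escape hatch on the client config builder. A hedged wiring sketch, assuming a rustls 0.22+ builder API and an already-configured crypto provider (not part of this diff):

use std::sync::Arc;

// Sketch only: builds a client config that delegates certificate checks to AcceptAll.
fn permissive_client_config(verifier: Arc<AcceptAll>) -> rustls::ClientConfig {
    rustls::ClientConfig::builder()
        .dangerous()
        .with_custom_certificate_verifier(verifier)
        .with_no_client_auth()
}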
/// Loads the root certificates and constructs a client config suitable for connecting.
/// This function is blocking.
fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
@@ -1290,76 +1337,12 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
.expect("ring should support the default protocol versions");
static DO_CERT_CHECKS: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
let do_cert_checks =
DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_CERT_CHECKS").is_ok());
DO_CERT_CHECKS.get_or_init(|| std::env::var("STORCON_DB_CERT_CHECKS").is_ok());
Ok(if *do_cert_checks {
client_config
.with_root_certificates(load_certs()?)
.with_no_client_auth()
} else {
use rustls::client::danger::{HandshakeSignatureValid, ServerCertVerified};
#[derive(Debug)]
struct AcceptAll(Arc<WebPkiServerVerifier>);
impl ServerCertVerifier for AcceptAll {
fn verify_server_cert(
&self,
end_entity: &rustls::pki_types::CertificateDer<'_>,
intermediates: &[rustls::pki_types::CertificateDer<'_>],
server_name: &rustls::pki_types::ServerName<'_>,
ocsp_response: &[u8],
now: rustls::pki_types::UnixTime,
) -> Result<ServerCertVerified, rustls::Error> {
let r = self.0.verify_server_cert(
end_entity,
intermediates,
server_name,
ocsp_response,
now,
);
if let Err(err) = r {
tracing::info!(
?server_name,
"ignoring db connection TLS validation error: {err:?}"
);
return Ok(ServerCertVerified::assertion());
}
r
}
fn verify_tls12_signature(
&self,
message: &[u8],
cert: &rustls::pki_types::CertificateDer<'_>,
dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
{
let r = self.0.verify_tls12_signature(message, cert, dss);
if let Err(err) = r {
tracing::info!(
"ignoring db connection 1.2 signature TLS validation error: {err:?}"
);
return Ok(HandshakeSignatureValid::assertion());
}
r
}
fn verify_tls13_signature(
&self,
message: &[u8],
cert: &rustls::pki_types::CertificateDer<'_>,
dss: &rustls::DigitallySignedStruct,
) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error>
{
let r = self.0.verify_tls13_signature(message, cert, dss);
if let Err(err) = r {
tracing::info!(
"ignoring db connection 1.3 signature TLS validation error: {err:?}"
);
return Ok(HandshakeSignatureValid::assertion());
}
r
}
fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
self.0.supported_verify_schemes()
}
}
let verifier = AcceptAll(
WebPkiServerVerifier::builder_with_provider(
load_certs()?,
@@ -1389,6 +1372,29 @@ fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<Async
fut.boxed()
}
#[cfg_attr(test, test)]
fn test_config_debug_censors_password() {
let has_pw =
"host=/var/lib/postgresql,localhost port=1234 user=specialuser password='NOT ALLOWED TAG'";
let has_pw_cfg = has_pw.parse::<tokio_postgres::Config>().unwrap();
assert!(format!("{has_pw_cfg:?}").contains("specialuser"));
// Ensure that the password is not leaked by the debug impl
assert!(!format!("{has_pw_cfg:?}").contains("NOT ALLOWED TAG"));
}
fn log_postgres_connstr_info(config_str: &str) -> anyhow::Result<()> {
let config = config_str
.parse::<tokio_postgres::Config>()
.map_err(|_e| anyhow::anyhow!("Couldn't parse config str"))?;
// We use debug formatting here, and use a unit test to ensure that we don't leak the password.
// To make extra sure the test gets run, run it every time the function is called
// (this is rather cold code, we can afford it).
#[cfg(not(test))]
test_config_debug_censors_password();
tracing::info!("database connection config: {config:?}");
Ok(())
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
#[derive(
QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq,

View File

@@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder {
}
}
pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
Self {
config: ReconcilerConfig {
tenant_creation_hint: hint,
..self.config
},
}
}
pub(crate) fn build(self) -> ReconcilerConfig {
self.config
}
@@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig {
// During live migrations this is the amount of time that
// the pageserver will hold our poll.
secondary_download_request_timeout: Option<Duration>,
// A hint indicating whether this reconciliation is being done as part of
// the creation of a new tenant. This informs how compute notification errors
// are handled and logged.
tenant_creation_hint: bool,
}
impl ReconcilerConfig {
@@ -143,6 +156,10 @@ impl ReconcilerConfig {
self.secondary_download_request_timeout
.unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
}
pub(crate) fn tenant_creation_hint(&self) -> bool {
self.tenant_creation_hint
}
}
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
@@ -934,16 +951,35 @@ impl Reconciler {
)
.await;
if let Err(e) = &result {
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
if !matches!(e, NotifyError::ShuttingDown) {
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
}
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point.
self.compute_notify_failure = true;
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
match e {
// 404s from cplane during tenant creation are expected.
// Cplane only persists the shards to the database after
// creating the tenant and the timeline. If we notify before
// that, we'll get a 404.
//
// This is fine because tenant creations happen via /location_config
// and that returns the list of locations in the response. Hence, we
// silence the error and return Ok(()) here. Reconciliation will still
// be retried because we set [`Reconciler::compute_notify_failure`] above.
NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
if self.reconciler_config.tenant_creation_hint() =>
{
return Ok(());
}
NotifyError::ShuttingDown => {}
_ => {
tracing::warn!(
"Failed to notify compute of attached pageserver {node}: {e}"
);
}
}
}
result
} else {

View File

@@ -2238,9 +2238,14 @@ impl Service {
let waiters = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
let config = ReconcilerConfigBuilder::new()
.tenant_creation_hint(true)
.build();
tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
.filter_map(|(_shard_id, shard)| {
self.maybe_configured_reconcile_shard(shard, nodes, config)
})
.collect::<Vec<_>>()
};

View File

@@ -707,6 +707,7 @@ impl TenantShard {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(scheduler, *node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
@@ -979,24 +980,51 @@ impl TenantShard {
),
)
})
.collect::<Vec<_>>();
.collect::<HashMap<_, _>>();
if secondary_scores.iter().any(|score| score.1.is_none()) {
// Don't have full list of scores, so can't make a good decision about which to drop unless
// there is an obvious one in the wrong AZ
for secondary in self.intent.get_secondary() {
if scheduler.get_node_az(secondary) == self.intent.preferred_az_id {
// Trivial case: if we only have one secondary, drop that one
if self.intent.get_secondary().len() == 1 {
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(
*self.intent.get_secondary().first().unwrap(),
),
});
}
// Try to find a "good" secondary to keep, without relying on scores (one or more nodes is in a state
// where its score can't be calculated), and drop the others. This enables us to make progress in
// most cases, even if some nodes are offline or have scheduling=pause set.
debug_assert!(self.intent.attached.is_some()); // We should not make it here unless attached -- this
// logic presumes we are in a mode where we want secondaries to be in non-home AZ
if let Some(retain_secondary) = self.intent.get_secondary().iter().find(|n| {
let in_home_az = scheduler.get_node_az(n) == self.intent.preferred_az_id;
let is_available = secondary_scores
.get(n)
.expect("Built from same list of nodes")
.is_some();
is_available && !in_home_az
}) {
// Great, we found one to retain. Pick some other to drop.
if let Some(victim) = self
.intent
.get_secondary()
.iter()
.find(|n| n != &retain_secondary)
{
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(*secondary),
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
});
}
}
// Fall through: we didn't identify one to remove. This ought to be rare.
tracing::warn!("Keeping extra secondaries: can't determine which of {:?} to remove (some nodes offline?)",
self.intent.get_secondary()
);
self.intent.get_secondary()
);
} else {
let victim = secondary_scores
.iter()
@@ -1005,7 +1033,7 @@ impl TenantShard {
.0;
return Some(ScheduleOptimization {
sequence: self.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(victim),
action: ScheduleOptimizationAction::RemoveSecondary(*victim),
});
}
}
@@ -2379,6 +2407,110 @@ pub(crate) mod tests {
Ok(())
}
/// Test how the optimisation code behaves with an extra secondary
#[test]
fn optimize_removes_secondary() -> anyhow::Result<()> {
let az_a_tag = AvailabilityZone("az-a".to_string());
let az_b_tag = AvailabilityZone("az-b".to_string());
let mut nodes = make_test_nodes(
4,
&[
az_a_tag.clone(),
az_b_tag.clone(),
az_a_tag.clone(),
az_b_tag.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());
let mut schedule_context = ScheduleContext::default();
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
shard_a.intent.preferred_az_id = Some(az_a_tag.clone());
shard_a
.schedule(&mut scheduler, &mut schedule_context)
.unwrap();
// Attached on node 1, secondary on node 2
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(2)]);
// Initially optimiser is idle
assert_eq!(
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
None
);
assert_eq!(
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
None
);
// A spare secondary in the home AZ: it should be removed -- this is the situation when we're midway through a graceful migration, after cutting over
// to our new location
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(3))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
// A spare secondary in the non-home AZ, and one of them is offline
shard_a.intent.push_secondary(&mut scheduler, NodeId(4));
nodes
.get_mut(&NodeId(4))
.unwrap()
.set_availability(NodeAvailability::Offline);
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(4))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
// A spare secondary when we should have none
shard_a.policy = PlacementPolicy::Attached(0);
let optimization = shard_a.optimize_attachment(&mut scheduler, &schedule_context);
assert_eq!(
optimization,
Some(ScheduleOptimization {
sequence: shard_a.sequence,
action: ScheduleOptimizationAction::RemoveSecondary(NodeId(2))
})
);
shard_a.apply_optimization(&mut scheduler, optimization.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![]);
// Check that in secondary mode, we preserve the secondary in the preferred AZ
let mut schedule_context = ScheduleContext::default(); // Fresh context, we're about to call schedule()
shard_a.policy = PlacementPolicy::Secondary;
shard_a
.schedule(&mut scheduler, &mut schedule_context)
.unwrap();
assert_eq!(shard_a.intent.get_attached(), &None);
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
assert_eq!(
shard_a.optimize_attachment(&mut scheduler, &schedule_context),
None
);
assert_eq!(
shard_a.optimize_secondary(&mut scheduler, &schedule_context),
None
);
shard_a.intent.clear(&mut scheduler);
Ok(())
}
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
// Returns the applied optimizations

View File

@@ -2766,6 +2766,11 @@ class NeonPageserver(PgProtocol, LogUtils):
log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}")
raise
def heatmap_content(self, tenant_shard_id: TenantId | TenantShardId) -> Any:
path = self.tenant_dir(tenant_shard_id) / "heatmap-v1.json"
with open(path) as f:
return json.load(f)
def tenant_create(
self,
tenant_id: TenantId,

View File

@@ -34,16 +34,20 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
cur.execute("set log_statement = 'all'")
cur.execute("create table t(x integer)")
for _ in range(n_iters):
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
with zenbenchmark.record_duration(f"insert into t values (generate_series(1,{n_records}))"):
cur.execute(f"insert into t values (generate_series(1,{n_records}))")
time.sleep(1)
cur.execute("vacuum t")
with zenbenchmark.record_duration("vacuum t"):
cur.execute("vacuum t")
with zenbenchmark.record_duration("test_query"):
with zenbenchmark.record_duration("SELECT count(*) from t"):
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (n_iters * n_records,)
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)
with zenbenchmark.record_duration("flush_ep_to_pageserver"):
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
with zenbenchmark.record_duration("timeline_checkpoint"):
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)

View File

@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
"LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
"PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
"PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
"PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
"PGOPTIONS": "-c idle_in_transaction_session_timeout=0 -c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
}
# Combine the current environment with custom variables
env = os.environ.copy()

View File

@@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}
PREEMPT_COMPACTION_TENANT_CONF = {
"gc_period": "5s",
"compaction_period": "5s",
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024**2,
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 1,
"image_creation_preempt_threshold": 1,
# compact more frequently
"compaction_threshold": 3,
"compaction_upper_limit": 6,
"lsn_lease_length": "0s",
}
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
@@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -113,6 +130,41 @@ page_cache_size=10
assert vectored_average < 8
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_preempt(
neon_env_builder: NeonEnvBuilder,
):
# Ideally we should be able to do unit tests for this, but we need real Postgres
# WALs in order to do unit testing...
conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 200000
churn_rounds = 10
ps_http = env.pageserver.http_client()
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageserver.id)
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)
for i in range(1, churn_rounds + 1):
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id, upload=False)
workload.validate(env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
# ensure image layer creation gets preempted and then resumed
env.pageserver.assert_log_contains("resuming image layer creation")
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"with_branches",
@@ -250,6 +302,9 @@ def test_pageserver_gc_compaction_idempotent(
workload.churn_rows(row_count, env.pageserver.id)
# compact 3 times if mode is before_restart
n_compactions = 3 if compaction_mode == "before_restart" else 1
ps_http.timeline_compact(
tenant_id, timeline_id, force_l0_compaction=True, wait_until_uploaded=True
)
for _ in range(n_compactions):
# Force refresh gc info to have gc_cutoff generated
ps_http.timeline_gc(tenant_id, timeline_id, None)

View File

@@ -95,6 +95,8 @@ def test_remote_extensions(
# mock remote_extensions spec
spec: dict[str, Any] = {
"public_extensions": ["anon"],
"custom_extensions": None,
"library_index": {
"anon": "anon",
},

View File

@@ -6,14 +6,9 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
from fixtures.pageserver.http import PageserverHttpClient
def check_tenant(
env: NeonEnv, pageserver_http: PageserverHttpClient, safekeeper_proto_version: int
):
def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient):
tenant_id, timeline_id = env.create_tenant()
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[
@@ -38,14 +33,7 @@ def check_tenant(
@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)])
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_normal_work(
neon_env_builder: NeonEnvBuilder,
num_timelines: int,
num_safekeepers: int,
safekeeper_proto_version: int,
):
def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int):
"""
Basic test:
* create new tenant with a timeline
@@ -64,4 +52,4 @@ def test_normal_work(
pageserver_http = env.pageserver.http_client()
for _ in range(num_timelines):
check_tenant(env, pageserver_http, safekeeper_proto_version)
check_tenant(env, pageserver_http)

View File

@@ -443,7 +443,7 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
workload.write_rows(256, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
def validate_heatmap(heatmap):
def validate_heatmap(heatmap, on_disk_heatmap):
assert len(heatmap["timelines"]) == 1
assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
assert len(heatmap["timelines"][0]["layers"]) > 0
@@ -452,10 +452,13 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
# Each layer appears at most once
assert len(set(layer["name"] for layer in layers)) == len(layers)
assert heatmap == on_disk_heatmap
# Download and inspect the heatmap that the pageserver uploaded
heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_first_on_disk = env.pageserver.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_first}")
validate_heatmap(heatmap_first)
validate_heatmap(heatmap_first, heatmap_first_on_disk)
# Do some more I/O to generate more layers
workload.churn_rows(64, env.pageservers[0].id)
@@ -463,9 +466,10 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
# Ensure that another heatmap upload includes the new layers
heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_second_on_disk = env.pageserver.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_second}")
assert heatmap_second != heatmap_first
validate_heatmap(heatmap_second)
validate_heatmap(heatmap_second, heatmap_second_on_disk)
def list_elegible_layers(

View File

@@ -538,16 +538,13 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
asyncio.run(run_recovery_uncommitted(env))
async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
async def run_wal_truncation(env: NeonEnv):
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
(sk1, sk2, sk3) = env.safekeepers
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
ep = env.endpoints.create_start("main", config_lines=config_lines)
ep = env.endpoints.create_start("main")
ep.safe_psql("create table t (key int, value text)")
ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
@@ -574,7 +571,6 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
sk2.start()
ep = env.endpoints.create_start(
"main",
config_lines=config_lines,
)
ep.safe_psql("insert into t select generate_series(1, 200), 'payload'")
@@ -593,13 +589,11 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
# Simple deterministic test creating tail of WAL on safekeeper which is
# truncated when majority without this sk elects walproposer starting earlier.
# Test both proto versions until we fully migrate.
@pytest.mark.parametrize("safekeeper_proto_version", [2, 3])
def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_version: int):
def test_wal_truncation(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
asyncio.run(run_wal_truncation(env))
async def run_segment_init_failure(env: NeonEnv):

View File

@@ -1,11 +1,11 @@
{
"v17": [
"17.2",
"f0ffc8279dbcbbc439981a4fd001a9687e5d665d"
"8dfd5a7030d3e8a98b60265ebe045788892ac7f3"
],
"v16": [
"16.6",
"3cf7ce1afab75027716d14223f95ddb300754162"
"86d9ea96ebb9088eac62f57f1f5ace68e70e0d1c"
],
"v15": [
"15.10",

View File

@@ -92,6 +92,7 @@ tonic = { version = "0.12", default-features = false, features = ["codegen", "pr
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tracing-log = { version = "0.2" }
url = { version = "2", features = ["serde"] }
zerocopy = { version = "0.7", features = ["derive", "simd"] }
zeroize = { version = "1", features = ["derive", "serde"] }