Compare commits

189 Commits

Vadim Kharitonov
49377abd98 Merge pull request #5577 from neondatabase/releases/2023-10-17
Release 2023-10-17
2023-10-17 12:21:20 +02:00
Christian Schwarz
a6b2f4e54e limit imitate accesses concurrency, using same semaphore as compactions (#5578)
Before this PR, when we restarted the pageserver, we'd see a rush of
`$number_of_tenants` concurrent eviction tasks starting to do imitate
accesses, building up in the period of `[init_order allows activations,
$random_access_delay + EvictionPolicyLayerAccessThreshold::period]`.

We simply cannot handle that degree of concurrent IO.

We already solved the problem for compactions by adding a semaphore.
So, this PR shares that semaphore for use by evictions.

Part of https://github.com/neondatabase/neon/issues/5479

Which is again part of https://github.com/neondatabase/neon/issues/4743

Risks / Changes In System Behavior
==================================

* evictions are not as timely as they were before
* we log a bunch of warnings about eviction taking too long
* imitate accesses and compactions compete for the same concurrency
limit, so they'll slow each other down through the shared semaphore

Changes
=======

- Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs`
- Rename it to `CONCURRENT_BACKGROUND_TASKS`
- Use it also for the eviction imitate accesses:
    - Imitate accesses are both per-TIMELINE and per-TENANT
    - The per-TENANT is done through coalescing all the per-TIMELINE
      tasks via a tokio mutex `eviction_task_tenant_state`.
    - We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the
      beginning of the eviction iteration, well before the imitate
      accesses start (and they may not even start at all in the given
      iteration, as they happen only every $threshold).
    - Acquiring early is **sub-optimal** because when the per-timeline
      tasks coalesce on the `eviction_task_tenant_state` mutex,
      they are already holding a CONCURRENT_BACKGROUND_TASKS permit.
    - It's also unfair because tenants with many timelines win
      a CONCURRENT_BACKGROUND_TASKS permit more often.
    - I don't think there's another way, though, without refactoring
      more of the imitate accesses logic, e.g., making it all per-tenant.
- Add metrics for queue depth behind the semaphore.
  I found these very useful to understand what work is queued in the
  system.

    - The metrics are tagged by the new `BackgroundLoopKind`.
    - On a clean slate, I would have used `TaskKind`, but we already had
      pre-existing labels whose names didn't map exactly to task kind.
      Also, the task kind is a lower-level detail, so I think it's fine
      to have a separate enum to identify background work kinds.
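
As a rough illustration of the pattern (names follow the commit message; the
queue-depth metric tagged by `BackgroundLoopKind` is simplified to a log line):

```
use std::sync::Arc;

use once_cell::sync::Lazy;
use tokio::sync::Semaphore;

// One semaphore shared by all background work, per the change above.
static CONCURRENT_BACKGROUND_TASKS: Lazy<Arc<Semaphore>> =
    Lazy::new(|| Arc::new(Semaphore::new(1)));

#[derive(Debug, Clone, Copy)]
enum BackgroundLoopKind {
    Compaction,
    Eviction,
}

async fn run_background_task(kind: BackgroundLoopKind) {
    // The commit adds a queue-depth gauge labeled by `kind` here,
    // decremented once the permit is acquired; we just log.
    println!("{kind:?}: queued");
    let _permit = CONCURRENT_BACKGROUND_TASKS
        .clone()
        .acquire_owned()
        .await
        .expect("semaphore never closed");
    println!("{kind:?}: running");
    // ... compaction or imitate-access work happens while holding the permit
}

#[tokio::main]
async fn main() {
    tokio::join!(
        run_background_task(BackgroundLoopKind::Compaction),
        run_background_task(BackgroundLoopKind::Eviction),
    );
}
```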

Future Work
===========

I guess I could move the eviction tasks from a ticker to "sleep for
$period".
The benefit would be that the semaphore automatically "smears" the
eviction task scheduling over time, so we'd only have the rush on
restart, with a smeared-out schedule afterward.

The downside is that this distorts the meaning of "$period", as we'd
no longer run eviction at a fixed period. It also means the "took too
long" warning & metric become meaningless.

Then again, that is already the case for the compaction and gc tasks,
which do sleep for `$period` instead of using a ticker.

(cherry picked from commit 9256788273)
2023-10-17 12:16:26 +02:00
Shany Pozin
face60d50b Merge pull request #5526 from neondatabase/releases/2023-10-11
Release 2023-10-11
2023-10-11 11:16:39 +03:00
Shany Pozin
9768aa27f2 Merge pull request #5516 from neondatabase/releases/2023-10-10
Release 2023-10-10
2023-10-10 14:16:47 +03:00
Shany Pozin
96b2e575e1 Merge pull request #5445 from neondatabase/releases/2023-10-03
Release 2023-10-03
2023-10-04 13:53:37 +03:00
Alexander Bayandin
7222777784 Update checksums for pg_jsonschema & pg_graphql (#5455)
## Problem

Folks have re-tagged releases for `pg_jsonschema` and `pg_graphql` (to
increase timeouts on their CI). For us, these are noop changes,
but unfortunately they will cause our builds to fail due to a checksum
mismatch (this might not strike right away because of the build cache).
- 8ba7c7be9d
- aa7509370a

## Summary of changes
- `pg_jsonschema` update checksum
- `pg_graphql` update checksum
2023-10-03 18:44:30 +01:00
Em Sharnoff
5469fdede0 Merge pull request #5422 from neondatabase/sharnoff/rc-2023-09-28-fix-restart-on-postmaster-SIGKILL
Release 2023-09-28: Fix (lack of) restart on neonvm postmaster SIGKILL
2023-09-28 10:48:51 -07:00
MMeent
72aa6b9fdd Fix neon_zeroextend's WAL logging (#5387)
When you log more than a few blocks, you need to reserve the space in
advance. We didn't do that, so we got errors. Now we do that, and
shouldn't get errors.
2023-09-28 09:37:28 -07:00
Em Sharnoff
ae0634b7be Bump vm-builder v0.17.11 -> v0.17.12 (#5407)
Only relevant change is neondatabase/autoscaling#534 - refer there for
more details.
2023-09-28 09:28:04 -07:00
Shany Pozin
70711f32fa Merge pull request #5375 from neondatabase/releases/2023-09-26
Release 2023-09-26
2023-09-26 15:19:45 +03:00
Vadim Kharitonov
52a88af0aa Merge pull request #5336 from neondatabase/releases/2023-09-19
Release 2023-09-19
2023-09-19 11:16:43 +02:00
Alexander Bayandin
b7a43bf817 Merge branch 'release' into releases/2023-09-19 2023-09-19 09:07:20 +01:00
Alexander Bayandin
dce91b33a4 Merge pull request #5318 from neondatabase/releases/2023-09-15-1
Postgres 14/15: Use previous extensions versions
2023-09-15 16:30:44 +01:00
Alexander Bayandin
23ee4f3050 Revert plv8 only 2023-09-15 15:45:23 +01:00
Alexander Bayandin
46857e8282 Postgres 14/15: Use previous extensions versions 2023-09-15 15:27:00 +01:00
Alexander Bayandin
368ab0ce54 Merge pull request #5313 from neondatabase/releases/2023-09-15
Release 2023-09-15
2023-09-15 10:39:56 +01:00
Konstantin Knizhnik
a5987eebfd References to old and new blocks were mixed in xlog_heap_update handler (#5312)
## Problem

See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029

https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178

## Summary of changes

Swap old/new block references

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2023-09-15 10:11:41 +01:00
Alexander Bayandin
6686ede30f Update checksum for pg_hint_plan (#5309)
## Problem

The checksum for `pg_hint_plan` doesn't match:
```
sha256sum: WARNING: 1 computed checksum did NOT match
```

Ref
https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307

It seems that the release was retagged yesterday:
https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0

I don't see any malicious changes from 15_1.5.1:
https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0,
so it should be ok to update.

## Summary of changes
- Update checksum for `pg_hint_plan` 16_1.6.0
2023-09-15 09:54:42 +01:00
Em Sharnoff
373c7057cc vm-monitor: Fix cgroup throttling (#5303)
I believe this (not actual IO problems) is the cause of the "disk speed
issue" that we've had for VMs recently. See e.g.:

1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH
2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659

The vm-informant (and now, the vm-monitor, its replacement) is supposed
to gradually increase the `neon-postgres` cgroup's memory.high value,
because otherwise the kernel will throttle all the processes in the
cgroup.

This PR fixes a bug with the vm-monitor's implementation of this
behavior.

---

Other references, for the vm-informant's implementation:

- Original issue: neondatabase/autoscaling#44
- Original PR: neondatabase/autoscaling#223
2023-09-15 09:54:42 +01:00
Shany Pozin
7d6ec16166 Merge pull request #5296 from neondatabase/releases/2023-09-13
Release 2023-09-13
2023-09-13 13:49:14 +03:00
Shany Pozin
0e6fdc8a58 Merge pull request #5283 from neondatabase/releases/2023-09-12
Release 2023-09-12
2023-09-12 14:56:47 +03:00
Christian Schwarz
521438a5c6 fix deadlock around TENANTS (#5285)
The sequence that can lead to a deadlock:

1. A DELETE request gets all the way to `tenant.shutdown(progress,
false).await.is_err()`, while holding `TENANTS.read()`
2. A POST request for tenant creation comes in and calls `tenant_map_insert`,
which does `let mut guard = TENANTS.write().await;`
3. Something that `tenant.shutdown()` needs to wait for needs a
`TENANTS.read().await`.
The only case identified by exhaustive manual scanning of the code base
is this one: imitate size access does `get_tenant().await`, which does
`TENANTS.read().await` under the hood.

In the above case, (1) waits for (3), (3)'s read-lock request is queued
behind (2)'s write-lock, and (2) waits for (1).
Deadlock.
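
The interleaving can be reproduced with a bare `tokio::sync::RwLock` (an
illustrative sketch, not the pageserver code; running it hangs by design,
since tokio's RwLock is fair and queues the second read behind the waiting
writer):

```
use std::sync::Arc;
use std::time::Duration;

use tokio::sync::RwLock;

#[tokio::main]
async fn main() {
    let tenants = Arc::new(RwLock::new(()));

    let t1 = Arc::clone(&tenants);
    let delete = tokio::spawn(async move {
        let _read = t1.read().await; // (1) DELETE holds TENANTS.read()
        tokio::time::sleep(Duration::from_millis(100)).await;
        let _read2 = t1.read().await; // (3) shutdown needs another read()
    });

    let t2 = Arc::clone(&tenants);
    let create = tokio::spawn(async move {
        tokio::time::sleep(Duration::from_millis(50)).await;
        let _write = t2.write().await; // (2) creation wants TENANTS.write()
    });

    let _ = tokio::join!(delete, create); // never completes
}
```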

I made a reproducer/proof-that-above-hypothesis-holds in
https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for
merge yet and we want the fix _now_.

fixes https://github.com/neondatabase/neon/issues/5284
2023-09-12 14:13:13 +03:00
Vadim Kharitonov
07d7874bc8 Merge pull request #5202 from neondatabase/releases/2023-09-05
Release 2023-09-05
2023-09-05 12:16:06 +02:00
Anastasia Lubennikova
1804111a02 Merge pull request #5161 from neondatabase/rc-2023-08-31
Release 2023-08-31
2023-08-31 16:53:17 +03:00
Arthur Petukhovsky
cd0178efed Merge pull request #5150 from neondatabase/release-sk-fix-active-timeline
Release 2023-08-30
2023-08-30 11:43:39 +02:00
Shany Pozin
333574be57 Merge pull request #5133 from neondatabase/releases/2023-08-29
Release 2023-08-29
2023-08-29 14:02:58 +03:00
Alexander Bayandin
79a799a143 Merge branch 'release' into releases/2023-08-29 2023-08-29 11:17:57 +01:00
Conrad Ludgate
9da06af6c9 Merge pull request #5113 from neondatabase/release-http-connection-fix
Release 2023-08-25
2023-08-25 17:21:35 +01:00
Conrad Ludgate
ce1753d036 proxy: dont return connection pending (#5107)
## Problem

We were returning Pending when a connection had a notice/notification
(introduced recently in #5020). When returning pending, the runtime
assumes you will call `cx.waker().wake()` in order to continue
processing.

We weren't doing that, so the connection task would get stuck

## Summary of changes

Don't return Pending; loop instead.
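
A minimal sketch of that contract (hypothetical poll body, not the proxy's
actual connection future):

```
use std::future::poll_fn;
use std::task::Poll;

#[tokio::main]
async fn main() {
    let mut notices_handled = 0u32;
    let n = poll_fn(|_cx| loop {
        if notices_handled < 3 {
            // A notice/notification was consumed internally. Returning
            // `Poll::Pending` here without registering a waker would leave
            // the task stuck forever: nothing would ever wake it again.
            notices_handled += 1;
            continue; // instead, loop and poll the inner stream again
        }
        return Poll::Ready(notices_handled);
    })
    .await;
    println!("handled {n} notices without stalling");
}
```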
2023-08-25 16:42:30 +01:00
Alek Westover
67db8432b4 Fix cargo deny errors (#5068)
## Problem
The cargo deny lint is broken.

Links to the CVEs:

[rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052)

[rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053)
One is fixed; the other one isn't, so we allow it (for now) to unbreak
CI. Later we'll try to get rid of webpki in favour of the rustls
fork.

## Summary of changes
```
+ignore = ["RUSTSEC-2023-0052"]
```
2023-08-25 16:42:30 +01:00
Vadim Kharitonov
4e2e44e524 Enable neon-pool-opt-in (#5062) 2023-08-22 09:06:14 +01:00
Vadim Kharitonov
ed786104f3 Merge pull request #5060 from neondatabase/releases/2023-08-22
Release 2023-08-22
2023-08-22 09:41:02 +02:00
Stas Kelvich
84b74f2bd1 Merge pull request #4997 from neondatabase/sk/proxy-release-23-07-15
Fix lint
2023-08-15 18:54:20 +03:00
Arthur Petukhovsky
fec2ad6283 Fix lint 2023-08-15 18:49:02 +03:00
Stas Kelvich
98eebd4682 Merge pull request #4996 from neondatabase/sk/proxy_release
Disable neon-pool-opt-in
2023-08-15 18:37:50 +03:00
Arthur Petukhovsky
2f74287c9b Disable neon-pool-opt-in 2023-08-15 18:34:17 +03:00
Shany Pozin
aee1bf95e3 Merge pull request #4990 from neondatabase/releases/2023-08-15
Release 2023-08-15
2023-08-15 15:34:38 +03:00
Shany Pozin
b9de9d75ff Merge branch 'release' into releases/2023-08-15 2023-08-15 14:35:00 +03:00
Stas Kelvich
7943b709e6 Merge pull request #4940 from neondatabase/sk/release-23-05-25-proxy-fixup
Release: proxy retry fixup
2023-08-09 13:53:19 +03:00
Conrad Ludgate
d7d066d493 proxy: delay auth on retry (#4929)
## Problem

When an endpoint is shutting down, it can take a few seconds. Currently,
when starting a new compute, this causes an "endpoint is in transition"
error. We need to add delays before retrying to ensure that we allow
time for the endpoint to shut down properly.

## Summary of changes

Adds a delay before retrying in auth; `connect_to_compute` already has
this delay.
2023-08-09 12:54:24 +03:00
Felix Prasanna
e78ac22107 release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932)
This reverts commit 682dfb3a31.

hotfix for a CLI arg issue in the monitor
2023-08-08 21:08:46 +03:00
Vadim Kharitonov
76a8f2bb44 Merge pull request #4923 from neondatabase/releases/2023-08-08
Release 2023-08-08
2023-08-08 11:44:38 +02:00
Vadim Kharitonov
8d59a8581f Merge branch 'release' into releases/2023-08-08 2023-08-08 10:54:34 +02:00
Vadim Kharitonov
b1ddd01289 Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889)
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2023-08-03 16:28:31 +03:00
Alexander Bayandin
6eae4fc9aa Release 2023-08-02: update pg_embedding (#4877)
Cherry-picking ca4d71a954 from `main` into
the `release`

Co-authored-by: Vadim Kharitonov <vadim2404@users.noreply.github.com>
2023-08-03 08:48:09 +02:00
Christian Schwarz
765455bca2 Merge pull request #4861 from neondatabase/releases/2023-08-01--2-fix-pipeline
ci: fix upload-postgres-extensions-to-s3 job
2023-08-01 13:22:07 +02:00
Christian Schwarz
4204960942 ci: fix upload-postgres-extensions-to-s3 job
commit

	commit 5f8fd640bf
	Author: Alek Westover <alek.westover@gmail.com>
	Date:   Wed Jul 26 08:24:03 2023 -0400

	    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but the
`promote-images` job only uploads `latest` to the prod ECR.

The switch to using the release tag was good in principle, but we are
reverting that part to make the release pipeline work.

Note that a proper fix should abandon use of the `:latest` tag
altogether: currently, if a `main` pipeline runs concurrently
with a `release` pipeline, the `release` pipeline may end
up using the `main` pipeline's images.
2023-08-01 12:01:45 +02:00
Christian Schwarz
67345d66ea Merge pull request #4858 from neondatabase/releases/2023-08-01
Release 2023-08-01
2023-08-01 10:44:01 +02:00
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately; instead it hangs until we time out (10s).

## Summary of changes

Refactor the loop to allow fast retries of `wake_compute` and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598: compute nodes are not accessible for some time after wake-up due
to kubernetes DNS not being fully propagated.

## Summary of changes

Update the connect retry mechanism to handle IO errors by sleeping for
100ms before retrying.

2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work underway to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949

(cherry picked from commit d6e35222ea)
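
The gist of the change, with hypothetical runtime definitions standing in
for the pageserver's named runtimes:

```
use once_cell::sync::Lazy;
use tokio::runtime::{Builder, Runtime};

// Hypothetical stand-ins for the pageserver's named runtimes.
static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread().worker_threads(2).enable_all().build().unwrap()
});
static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    Builder::new_multi_thread().enable_all().build().unwrap()
});

fn main() {
    // Before: the synthetic size worker ran on MGMT_REQUEST_RUNTIME, and its
    // synchronous disk IO blocked that runtime's executor threads.
    // After: CPU- and disk-heavy work is spawned on the background runtime.
    let handle = BACKGROUND_RUNTIME.spawn(async {
        // stands in for logical size calculation, which currently does
        // synchronous disk IO; spawn_blocking is the longer-term fix
        std::fs::metadata(".").is_ok()
    });
    let ok = MGMT_REQUEST_RUNTIME.block_on(handle).unwrap();
    println!("computed on the background runtime: {ok}");
}
```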
2023-06-23 20:54:07 +02:00
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, when catalog updates were skipped, it wasn't clear from the
log whether Postgres had started up completely.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and better open-connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., ACLs in pg_class), we adjusted rust-postgres to
   respond with the text protocol, simplifying serialization and fixing queries
   with text-only types in the response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
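
A condensed sketch of the conversions in point 3 above (`PgValue` is
hypothetical; the real proxy converts from the text protocol):

```
use serde_json::{json, Value};

// Hypothetical decoded value, standing in for a text-protocol cell.
enum PgValue {
    Int4(i32),
    Float8(f64),
    Bool(bool),
    Null,
    Text(String),
    Jsonb(Value),
}

fn to_json(v: PgValue) -> Value {
    match v {
        PgValue::Int4(n) => json!(n),                    // json number
        PgValue::Float8(f) if f.is_finite() => json!(f), // json number
        PgValue::Float8(f) => json!(f.to_string()),      // NaN and Inf remain text
        PgValue::Bool(b) => json!(b),
        PgValue::Null => Value::Null,
        PgValue::Text(s) => json!(s),
        PgValue::Jsonb(j) => j,                          // passed through as-is
    }
}

fn main() {
    println!("{}", to_json(PgValue::Float8(f64::NAN))); // prints "NaN"
}
```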
2023-05-25 17:59:17 +03:00
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates into the
layer map if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but it's
better to live with the bug that we've had in production for a long
time than to rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally the buffer is empty before the proxy pass,
but this is not the case with the pipeline mode of our npm driver; fixes the
connection hangup introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
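
In outline, assuming a generic stream pair (the signature and `buf`
handling are illustrative, not the proxy's actual types):

```
use tokio::io::{copy_bidirectional, AsyncRead, AsyncWrite, AsyncWriteExt};

// `buf` holds the bytes already sitting in the framed reader after the
// handshake (non-empty in pipeline mode).
async fn proxy_pass<C, S>(mut client: C, mut compute: S, buf: &[u8]) -> std::io::Result<()>
where
    C: AsyncRead + AsyncWrite + Unpin,
    S: AsyncRead + AsyncWrite + Unpin,
{
    if !buf.is_empty() {
        // forward the leftover bytes first, otherwise they are lost
        compute.write_all(buf).await?;
    }
    // only then start the bidirectional passthrough
    copy_bidirectional(&mut client, &mut compute).await?;
    Ok(())
}
```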
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in the legacy proxy too, in order to avoid breaking
client connections and to allow, for example, backups to run even
during deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

Release note: The version of PostgreSQL that we use is updated to 14.7
for PostgreSQL 14 and 15.2 for PostgreSQL 15.
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, so the old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
previously we applied the rate limiting only up to receiving the headers
from s3, or somewhere near it. the commit adds an adapter which carries
the permit until the AsyncRead has been disposed.

fixes #3662.
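
The adapter shape could look roughly like this (a sketch, not the actual
remote_storage code):

```
use std::io;
use std::pin::Pin;
use std::task::{Context, Poll};

use tokio::io::{AsyncRead, ReadBuf};
use tokio::sync::OwnedSemaphorePermit;

// The permit is released only when the whole reader is dropped, i.e. after
// the download has been read to EOF (or abandoned), not when headers arrive.
struct PermittedRead<R> {
    inner: R,
    _permit: OwnedSemaphorePermit,
}

impl<R: AsyncRead + Unpin> AsyncRead for PermittedRead<R> {
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        Pin::new(&mut self.inner).poll_read(cx, buf)
    }
}
```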
2023-02-22 15:07:57 +02:00
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file, which led to a deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
these are happening in tests because of #3655 but they sure took some
time to appear.

makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` into a globally allowed error, because it
has been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Previously, only the timelines which had passed the `gc_horizon` were
processed, which failed with orphans at the tree_sort phase. Example
input is in the added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would have been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously a tenant_size of 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: dont stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of an already
deleted role. This was causing a crash loop when compute was restarted
with a spec that includes a delta operation for role deletion. To avoid
such cases, check that the role is still present before calling
`reassign_owned_objects`.

Resolves neondatabase/cloud#3553
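
A minimal sketch of the check, assuming a tokio-postgres client; the
REASSIGN target role here is hypothetical:

```
use tokio_postgres::{Client, Error};

async fn handle_role_deletion(client: &Client, role: &str) -> Result<(), Error> {
    let still_exists = client
        .query_opt("SELECT 1 FROM pg_roles WHERE rolname = $1", &[&role])
        .await?
        .is_some();
    if still_exists {
        // reassign owned objects only if the role wasn't already dropped, so
        // a compute restart replaying the same spec doesn't crash-loop here
        client
            .batch_execute(&format!("REASSIGN OWNED BY {role} TO some_admin_role"))
            .await?;
        client.batch_execute(&format!("DROP ROLE {role}")).await?;
    }
    Ok(())
}
```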
2023-01-23 17:46:13 +02:00
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages fo the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build a custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres
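
The mechanical part of the change boils down to this pattern (a sketch;
the function and its stages are illustrative):

```
use tracing::{info, instrument};

// `prepare_and_run` stands in for one of the instrumented startup stages.
#[instrument]
fn prepare_and_run(compute_id: &str) {
    info!("starting compute");
}

fn main() {
    tracing_subscriber::fmt::init(); // replaces env_logger::init()
    prepare_and_run("compute-1");
}
```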
2023-01-23 17:46:12 +02:00
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to the staging URL.
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main', and don't give the
console a chance to observe the error. Which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
2023-01-23 17:46:12 +02:00
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context for debugging that issue.
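
The kind of context being added, sketched with `anyhow` (the function and
path are illustrative):

```
use anyhow::{Context, Result};
use std::path::Path;

fn remove_dir_with_context(path: &Path) -> Result<()> {
    // the closure attaches the path to the underlying io::Error
    std::fs::remove_dir(path)
        .with_context(|| format!("Failed to remove directory {}", path.display()))
}

fn main() -> Result<()> {
    let dir = std::env::temp_dir().join("context-demo");
    std::fs::create_dir_all(&dir)?;
    remove_dir_with_context(&dir)
}
```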
2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release`. Deleting them here.
        storage_broker/src/bin/storage_broker.rs
        The hotfix toned down logging, but `main` has since implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to new region see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to new region see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to fix an issue with missing keepalives.
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but we need to understand the root cause better.
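
The hotfix pattern, sketched with a tokio mpsc channel (illustrative; the
broker's actual types differ):

```
use tokio::sync::mpsc::{channel, error::TrySendError};

#[tokio::main]
async fn main() {
    let (tx, _rx) = channel::<u64>(1);
    tx.try_send(1).unwrap();
    // If a subscriber's channel is full, terminate the subscription instead
    // of blocking on (or endlessly buffering for) the slow consumer.
    if let Err(TrySendError::Full(_msg)) = tx.try_send(2) {
        eprintln!("subscriber too slow; terminating its subscription");
        drop(tx);
    }
}
```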
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but it also updates libc, and that causes
trouble in the resulting image as different extensions were built against
different libc versions. We could either use libc from debian-testing everywhere
or refrain from using testing packages and install the necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In passing, declare h3_postgis as a safe extension (previous omission).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-22 02:26:28 +03:00
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver.
- There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic.
- There was an issue with getting `NoKeepAlives` reason in a loop. The issue is probably the same as the previous.
- There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers indefinitely. This is fixed by using retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeepers errors.

New retry logic works like this:
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.
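
In outline, with assumed field names and illustrative durations:

```
use std::time::{Duration, Instant};

struct Candidate {
    next_retry_at: Option<Instant>,
    retry_cooldown: Duration,
}

impl Candidate {
    // not considered for connection until `next_retry_at` has passed
    fn can_connect(&self, now: Instant) -> bool {
        self.next_retry_at.map_or(true, |t| now >= t)
    }
    // on disconnect: exponential backoff, capped
    fn on_disconnect(&mut self, now: Instant) {
        self.retry_cooldown = (self.retry_cooldown * 2).min(Duration::from_secs(30));
        self.next_retry_at = Some(now + self.retry_cooldown);
    }
    // on last_record_lsn progress: reset, allowing an instant reconnect
    fn on_progress(&mut self) {
        self.retry_cooldown = Duration::from_millis(100);
        self.next_retry_at = None;
    }
}

fn main() {
    let now = Instant::now();
    let mut c = Candidate { next_retry_at: None, retry_cooldown: Duration::from_millis(100) };
    c.on_disconnect(now);
    assert!(!c.can_connect(now));
    c.on_progress();
    assert!(c.can_connect(now));
}
```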

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions, or (in the future) to upgrade postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncate it

refer #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush the in-memory layer eventually when no new data arrives, which helps
safekeepers suspend activity (stop pushing to the broker). The default of 10m
should be ok.
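
A minimal sketch of that mechanism, with invented names (the real pageserver types and API differ):

```rust
use std::time::{Duration, Instant};

// Hypothetical stand-in for a timeline's open in-memory layer state.
struct Timeline {
    last_record_at: Instant,
    has_open_layer: bool,
}

// Sketch: flush the open in-memory layer once no WAL has arrived for
// `checkpoint_timeout`, so downstream components can go idle.
async fn idle_checkpoint_loop(mut tl: Timeline, checkpoint_timeout: Duration) {
    let mut ticker = tokio::time::interval(checkpoint_timeout);
    loop {
        ticker.tick().await;
        if tl.has_open_layer && tl.last_record_at.elapsed() >= checkpoint_timeout {
            // A real implementation would freeze the layer and write it to disk.
            tl.has_open_layer = false;
        }
    }
}
```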

* Share exponential backoff code and fix logic for delete task failure (#2252)
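
For context, the shape of a capped exponential backoff is sketched below; the constants are illustrative, not the values from this PR:

```rust
// Illustrative only: the delay doubles per attempt and is capped at `max_secs`.
fn backoff_secs(attempt: u32, base_secs: f64, max_secs: f64) -> f64 {
    (base_secs * 2f64.powi(attempt.min(31) as i32)).min(max_secs)
}

// backoff_secs(0, 1.0, 60.0) == 1.0
// backoff_secs(3, 1.0, 60.0) == 8.0
// backoff_secs(10, 1.0, 60.0) == 60.0 (capped)
```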

* Fix bug when importing large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
We need to wait for the pageserver to catch up with the compute's last flush LSN because, during the timeline physical size API call, `LayerFlushThread` threads may still be running. These threads flush new layers to disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads run **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait until all the WAL is processed (not flushed) before calculating the physical size should reduce the "flakiness" significantly.

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.
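
The shape of that change, sketched with invented field names:

```rust
// Before: the decoder's phase was implied by which Option fields were Some.
struct StateBefore {
    contlen: Option<u32>,
    partial_record: Option<Vec<u8>>,
}

// After: the phase is explicit, and invalid combinations are unrepresentable.
enum State {
    WaitingForRecord,
    ReassemblingRecord { partial_record: Vec<u8>, contlen: u32 },
}
```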

* disable `test_import_from_pageserver_multisegment` (#2258)

This test is failing consistently on `main` right now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause of the test failure.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler and
human-readable as well. We don't use base64 anymore, since
url encoding got us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery altogether and fix get_binaries for the version variable

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could make it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; error during decoding is normal
I decided to put it separately.

Test
cargo test test_find_end_of_wal_last_crossing_segment
prepared earlier by @yeputons passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066
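
The core idea, as a simplified sketch (the real code uses WalStreamDecoder; the types here are stand-ins):

```rust
// Walk records forward from a known boundary; a decode error is not a
// failure but the signal that we have run past the end of valid WAL.
fn find_end_of_wal(
    records: impl Iterator<Item = Result<u64 /* record end LSN */, String>>,
    start_lsn: u64,
) -> u64 {
    let mut end = start_lsn;
    for rec in records {
        match rec {
            Ok(rec_end) => end = rec_end, // valid record: advance the end LSN
            Err(_) => break,              // decoding failed: end of WAL reached
        }
    }
    end
}
```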

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug where `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered out all safekeepers in some strange cases. I removed this filter; it should probably help with #2237
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down.
- Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second.
- `LaggingWal` check now uses `commit_lsn` directly from the safekeeper. This fixes the issue with frequent reconnects when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper is stuck.
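
The rewritten `NoWalTimeout` condition boils down to something like the following sketch (field names invented for illustration):

```rust
use std::time::{Duration, Instant};

// Fire only when we know newer WAL exists (per the safekeeper's reported
// commit_lsn) and the connected safekeeper hasn't sent anything for
// `lagging_wal_timeout`.
fn no_wal_timeout(
    known_commit_lsn: u64,
    last_streamed_lsn: u64,
    last_message_at: Instant,
    lagging_wal_timeout: Duration,
) -> bool {
    known_commit_lsn > last_streamed_lsn
        && last_message_at.elapsed() >= lagging_wal_timeout
}
```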

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses the branch neondatabase/postgres#171 and several required commits from main to use only locally built compute-tools. This should allow us to roll out the safekeeper sync issue fix on prod
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2022-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
113 changed files with 2815 additions and 7498 deletions


@@ -320,9 +320,6 @@ jobs:
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -837,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.18.1
VM_BUILDER_VERSION: v0.17.12
steps:
- name: Checkout


@@ -32,7 +32,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -90,21 +90,18 @@ jobs:
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
run: make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
run: make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
run: make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
- name: Build walproposer-lib
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
run: make neon-pg-ext -j$(nproc)
- name: Run cargo build
run: cargo build --all --release
@@ -129,7 +126,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
@@ -138,9 +135,6 @@ jobs:
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
- name: Build walproposer-lib
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings

.gitignore

@@ -18,6 +18,3 @@ test_output/
*.o
*.so
*.Po
# pgindent typedef lists
*.list

Cargo.lock

@@ -285,7 +285,7 @@ dependencies = [
"log",
"parking",
"polling",
"rustix 0.37.25",
"rustix 0.37.19",
"slab",
"socket2 0.4.9",
"waker-fn",
@@ -853,27 +853,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "azure_identity"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
dependencies = [
"async-lock",
"async-trait",
"azure_core",
"futures",
"log",
"oauth2",
"pin-project",
"serde",
"serde_json",
"time 0.3.21",
"tz-rs",
"url",
"uuid",
]
[[package]]
name = "azure_storage"
version = "0.16.0"
@@ -1152,11 +1131,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
dependencies = [
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"serde",
"wasm-bindgen",
"winapi",
]
@@ -2582,7 +2559,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
dependencies = [
"hermit-abi",
"io-lifetimes",
"rustix 0.37.25",
"rustix 0.37.19",
"windows-sys 0.48.0",
]
@@ -2972,34 +2949,6 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]]
name = "oauth2"
version = "4.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f"
dependencies = [
"base64 0.13.1",
"chrono",
"getrandom 0.2.9",
"http",
"rand 0.8.5",
"serde",
"serde_json",
"serde_path_to_error",
"sha2 0.10.6",
"thiserror",
"url",
]
[[package]]
name = "object"
version = "0.30.3"
@@ -3561,7 +3510,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3574,7 +3523,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"native-tls",
"tokio",
@@ -3585,7 +3534,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3603,7 +3552,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3734,7 +3683,7 @@ dependencies = [
"byteorder",
"hex",
"lazy_static",
"rustix 0.36.16",
"rustix 0.36.14",
]
[[package]]
@@ -4072,7 +4021,6 @@ dependencies = [
"aws-smithy-http",
"aws-types",
"azure_core",
"azure_identity",
"azure_storage",
"azure_storage_blobs",
"bytes",
@@ -4317,9 +4265,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.36.16"
version = "0.36.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62"
dependencies = [
"bitflags",
"errno",
@@ -4331,9 +4279,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.37.25"
version = "0.37.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
dependencies = [
"bitflags",
"errno",
@@ -5174,7 +5122,7 @@ dependencies = [
"cfg-if",
"fastrand 1.9.0",
"redox_syscall 0.3.5",
"rustix 0.37.25",
"rustix 0.37.19",
"windows-sys 0.45.0",
]
@@ -5271,8 +5219,6 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
dependencies = [
"itoa",
"js-sys",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros 0.2.9",
@@ -5407,7 +5353,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
dependencies = [
"async-trait",
"byteorder",
@@ -5422,7 +5368,7 @@ dependencies = [
"pin-project-lite",
"postgres-protocol",
"postgres-types",
"socket2 0.5.3",
"socket2 0.4.9",
"tokio",
"tokio-util",
]
@@ -5824,15 +5770,6 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "tz-rs"
version = "0.6.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
dependencies = [
"const_fn",
]
[[package]]
name = "uname"
version = "0.1.1"
@@ -6092,17 +6029,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "walproposer"
version = "0.1.0"
dependencies = [
"anyhow",
"bindgen",
"postgres_ffi",
"utils",
"workspace_hack",
]
[[package]]
name = "want"
version = "0.3.0"
@@ -6508,6 +6434,7 @@ dependencies = [
"serde",
"serde_json",
"smallvec",
"socket2 0.4.9",
"standback",
"syn 1.0.109",
"syn 2.0.28",


@@ -26,7 +26,6 @@ members = [
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
"libs/walproposer",
]
[workspace.package]
@@ -37,10 +36,9 @@ license = "Apache-2.0"
[workspace.dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
azure_core = "0.16"
azure_identity = "0.16"
azure_core = "0.16.0"
azure_storage = "0.16"
azure_storage_blobs = "0.16"
azure_storage_blobs = "0.16.0"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -161,11 +159,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -186,7 +184,6 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
walproposer = { version = "0.1", path = "./libs/walproposer/" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -202,7 +199,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
################# Binary contents sections


@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers walproposer-lib
neon: postgres-headers
+@echo "Compiling Neon"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
@@ -168,42 +168,6 @@ neon-pg-ext-clean-%:
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.
.PHONY: walproposer-lib
walproposer-lib: neon-pg-ext-v16
+@echo "Compiling walproposer-lib"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
ifeq ($(UNAME_S),Linux)
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
pg_strong_random.o
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
pg_crc32c.o \
hmac_openssl.o \
cryptohash_openssl.o \
scram-common.o \
md5_common.o \
checksum_helper.o
endif
.PHONY: walproposer-lib-clean
walproposer-lib-clean:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
@@ -256,44 +220,6 @@ distclean:
fmt:
./pre-commit.py --fix-inplace
postgres-%-pg-bsd-indent: postgres-%
+@echo "Compiling pg_bsd_indent"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
# Create typedef list for the core. Note that generally it should be combined with
# buildfarm one to cover platform specific stuff.
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
postgres-%-typedefs.list: postgres-%
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
# Indent postgres. See src/tools/pgindent/README for details.
.PHONY: postgres-%-pgindent
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+@echo merge with buildfarm typedef to cover all platforms
+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
REL_16_STABLE list misses PGSemaphoreData
# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+@echo note: you might want to run it on selected files/dirs instead.
INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
rm -f pg*.BAK
# Indent pgxn/neon.
.PHONY: pgindent
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

NOTICE

@@ -1,5 +1,5 @@
Neon
Copyright 2022 Neon Inc.
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
See vendor/postgres-vX/COPYRIGHT for details.
The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.


@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));


@@ -302,7 +302,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
RoleAction::Create => {
let mut query: String = format!(
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);


@@ -28,7 +28,7 @@ mod pg_helpers_tests {
assert_eq!(
spec.cluster.settings.as_pg_settings(),
r#"fsync = off
wal_level = logical
wal_level = replica
hot_standby = on
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
wal_log_hints = on


@@ -253,7 +253,7 @@ impl Endpoint {
conf.append("shared_buffers", "1MB");
conf.append("fsync", "off");
conf.append("max_connections", "100");
conf.append("wal_level", "logical");
conf.append("wal_level", "replica");
// wal_sender_timeout is the maximum time to wait for WAL replication.
// It also defines how often the walreceiver will send a feedback message to the wal sender.
conf.append("wal_sender_timeout", "5s");


@@ -25,7 +25,7 @@
},
{
"name": "wal_level",
"value": "logical",
"value": "replica",
"vartype": "enum"
},
{


@@ -188,60 +188,11 @@ that.
## Error message style
### PostgreSQL extensions
PostgreSQL has a style guide for writing error messages:
https://www.postgresql.org/docs/current/error-style-guide.html
Follow that guide when writing error messages in the PostgreSQL
extensions.
### Neon Rust code
#### Anyhow Context
When adding anyhow `context()`, use form `present-tense-verb+action`.
Example:
- Bad: `file.metadata().context("could not get file metadata")?;`
- Good: `file.metadata().context("get file metadata")?;`
#### Logging Errors
When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
If `e` is an `anyhow` error and you want to log the backtrace that it contains,
use `{e:?}` instead of `{e:#}`.
#### Rationale
The `{:#}` ("alternate Display") of an `anyhow` error chain is the concatenation of the contexts, joined with `: `.
For example, the following Rust code will result in output
```
ERROR failed to list users: load users from server: parse response: invalid json
```
This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
```
ERROR could not list users: could not load users from server: could not parse response: invalid json
```
```rust
fn main() {
match list_users().context("list users") {
Ok(_) => ...,
Err(e) => tracing::error!("failed to {e:#}"),
}
}
fn list_users() {
http_get_users().context("load users from server")?;
}
fn http_get_users() {
let response = client....?;
response.parse().context("parse response")?; // fails with serde error "invalid json"
}
```
extension. We don't follow it strictly in the pageserver and
safekeeper, but the advice in the PostgreSQL style guide is generally
good, and you can't go wrong by following it.


@@ -76,7 +76,7 @@
},
{
"name": "wal_level",
"value": "logical",
"value": "replica",
"vartype": "enum"
},
{


@@ -1,6 +1,6 @@
use std::io::{Read, Result, Write};
/// A wrapper for an object implementing [Read]
/// A wrapper for an object implementing [Read](std::io::Read)
/// which allows a closure to observe the amount of bytes read.
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
///
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
}
}
/// Get an immutable reference to the underlying [Read] implementor
/// Get an immutable reference to the underlying [Read](std::io::Read) implementor
pub fn inner(&self) -> &T {
&self.reader
}
/// Get a mutable reference to the underlying [Read] implementor
/// Get a mutable reference to the underlying [Read](std::io::Read) implementor
pub fn inner_mut(&mut self) -> &mut T {
&mut self.reader
}
/// Consume the wrapper and return the underlying [Read] implementor
/// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
pub fn into_inner(self) -> T {
self.reader
}
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
}
}
/// A wrapper for an object implementing [Write]
/// A wrapper for an object implementing [Write](std::io::Write)
/// which allows a closure to observe the amount of bytes written.
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
///
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
}
}
/// Get an immutable reference to the underlying [Write] implementor
/// Get an immutable reference to the underlying [Write](std::io::Write) implementor
pub fn inner(&self) -> &T {
&self.writer
}
/// Get a mutable reference to the underlying [Write] implementor
/// Get a mutable reference to the underlying [Write](std::io::Write) implementor
pub fn inner_mut(&mut self) -> &mut T {
&mut self.writer
}
/// Consume the wrapper and return the underlying [Write] implementor
/// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
pub fn into_inner(self) -> T {
self.writer
}
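
For reference, the pattern these wrappers implement can be sketched in self-contained form; this is a simplified analogue, not the actual CountedReader/CountedWriter implementation:

```rust
use std::io::{Read, Result};

// A reader wrapper that lets a closure observe the byte count of every read,
// mirroring the documented intent of CountedReader.
struct ObservedReader<R, F: FnMut(usize)> {
    inner: R,
    observe: F,
}

impl<R: Read, F: FnMut(usize)> Read for ObservedReader<R, F> {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        let n = self.inner.read(buf)?;
        (self.observe)(n); // e.g. add `n` to an IntCounter
        Ok(n)
    }
}
```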


@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
use pq_proto::{
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
SQLSTATE_SUCCESSFUL_COMPLETION,
};
/// An error that occurred during query processing:
@@ -30,9 +30,6 @@ pub enum QueryError {
/// The connection was lost while processing the query.
#[error(transparent)]
Disconnected(#[from] ConnectionError),
/// We were instructed to shut down while processing the query
#[error("Shutting down")]
Shutdown,
/// Some other error
#[error(transparent)]
Other(#[from] anyhow::Error),
@@ -47,8 +44,7 @@ impl From<io::Error> for QueryError {
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Disconnected(_) => b"08006", // connection failure
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
@@ -400,20 +396,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
// socket might be already closed, e.g. if previously received error,
// so ignore result.
self.framed.shutdown().await.ok();
match ret {
Ok(()) => Ok(()),
Err(QueryError::Shutdown) => {
info!("Stopped due to shutdown");
Ok(())
}
Err(QueryError::Disconnected(e)) => {
info!("Disconnected ({e:#})");
// Disconnection is not an error: we just use it that way internally to drop
// out of loops.
Ok(())
}
e => e,
}
ret
}
async fn run_message_loop<F, S>(
@@ -433,11 +416,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during handshake");
return Err(QueryError::Shutdown)
return Ok(())
},
handshake_r = self.handshake(handler) => {
handshake_r?;
result = self.handshake(handler) => {
// Handshake complete.
result?;
if self.state == ProtoState::Closed {
return Ok(()); // EOF during handshake
}
}
);
@@ -448,7 +435,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received in run_message_loop");
return Err(QueryError::Shutdown)
Ok(None)
},
msg = self.read_message() => { msg },
)? {
@@ -460,14 +447,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
_ = shutdown_watcher() => {
// We were requested to shut down.
tracing::info!("shutdown request received during response flush");
// If we exited process_message with a shutdown error, there may be
// some valid response content on in our transmit buffer: permit sending
// this within a short timeout. This is a best effort thing so we don't
// care about the result.
tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
return Err(QueryError::Shutdown)
return Ok(())
},
flush_r = self.flush() => {
flush_r?;
@@ -580,9 +560,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
self.peer_addr
);
self.state = ProtoState::Closed;
return Err(QueryError::Disconnected(ConnectionError::Protocol(
ProtocolError::Protocol("EOF during handshake".to_string()),
)));
return Ok(());
}
}
}
@@ -621,9 +599,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
self.peer_addr
);
self.state = ProtoState::Closed;
return Err(QueryError::Disconnected(ConnectionError::Protocol(
ProtocolError::Protocol("EOF during auth".to_string()),
)));
return Ok(());
}
}
}
@@ -947,7 +923,6 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Shutdown => "shutdown".to_string(),
QueryError::Other(e) => format!("{e:#}"),
}
}
@@ -964,9 +939,6 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::Disconnected(other_connection_error) => {
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
}
QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown")
}
QueryError::Other(e) => {
error!("query handler for '{query}' failed: {e:?}");
}


@@ -131,7 +131,6 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
// Export some version independent functions that are used outside of this mod
pub use v14::xlog_utils::encode_logical_message;
pub use v14::xlog_utils::from_pg_timestamp;
pub use v14::xlog_utils::get_current_timestamp;
pub use v14::xlog_utils::to_pg_timestamp;
pub use v14::xlog_utils::XLogFileName;


@@ -220,10 +220,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From replication/slot.h */
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
+ 64 /* NameData */ + 4*4;
/* From fsm_internals.h */
const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;


@@ -136,42 +136,21 @@ pub fn get_current_timestamp() -> TimestampTz {
to_pg_timestamp(SystemTime::now())
}
// Module to reduce the scope of the constants
mod timestamp_conversions {
use std::time::Duration;
use super::*;
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
const SECS_PER_DAY: u64 = 86400;
const USECS_PER_SEC: u64 = 1000000;
const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
(POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
match time.duration_since(SystemTime::UNIX_EPOCH) {
Ok(n) => {
((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
+ n.subsec_micros() as u64) as i64
}
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
match time.duration_since(SystemTime::UNIX_EPOCH) {
Ok(n) => {
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
* USECS_PER_SEC
+ n.subsec_micros() as u64) as i64
}
}
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
let time: u64 = time
.try_into()
.expect("timestamp before millenium (postgres epoch)");
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
SystemTime::UNIX_EPOCH
.checked_add(Duration::from_micros(since_unix_epoch))
.expect("SystemTime overflow")
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
}
}
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
// start_lsn must point to some previously known record boundary (beginning of
// the next record). If no valid record after is found, start_lsn is returned
@@ -502,24 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
wal
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_ts_conversion() {
let now = SystemTime::now();
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
let now_pg = get_current_timestamp();
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
assert_eq!(now_pg, round_trip_pg);
}
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
}
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
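
A quick sanity check of the constants in this hunk: the gap between the two epochs equals the Unix timestamp of 2000-01-01 00:00:00 UTC.

```rust
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
const SECS_PER_DAY: u64 = 86400;

fn main() {
    // (2451545 - 2440588) * 86400 = 10957 * 86400 = 946_684_800 seconds,
    // i.e. 2000-01-01 00:00:00 UTC as a Unix timestamp.
    assert_eq!((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY, 946_684_800);
}
```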


@@ -670,7 +670,6 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
}
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
impl<'a> BeMessage<'a> {


@@ -28,7 +28,6 @@ utils.workspace = true
pin-project-lite.workspace = true
workspace_hack.workspace = true
azure_core.workspace = true
azure_identity.workspace = true
azure_storage.workspace = true
azure_storage_blobs.workspace = true
futures-util.workspace = true


@@ -1,15 +1,12 @@
//! Azure Blob Storage wrapper
use std::env;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::{borrow::Cow, collections::HashMap, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::Header;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{
@@ -41,16 +38,12 @@ impl AzureBlobStorage {
azure_config.container_name
);
let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
let account =
std::env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
let access_key =
std::env::var("AZURE_STORAGE_ACCESS_KEY").expect("missing AZURE_STORAGE_ACCESS_KEY");
// If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
// otherwise try the token based credentials.
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
StorageCredentials::access_key(account.clone(), access_key)
} else {
let token_credential = DefaultAzureCredential::default();
StorageCredentials::token_credential(Arc::new(token_credential))
};
let credentials = StorageCredentials::access_key(account.clone(), access_key);
let builder = ClientBuilder::new(account, credentials);
@@ -121,7 +114,22 @@ impl AzureBlobStorage {
// https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
let part = match part {
Ok(l) => l,
Err(e) => {
return Err(if let Some(http_err) = e.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::BadRequest => {
DownloadError::BadInput(anyhow::Error::new(e))
}
_ => DownloadError::Other(anyhow::Error::new(e)),
}
} else {
DownloadError::Other(e.into())
});
}
};
let data = part
.data
.collect()
@@ -142,16 +150,30 @@ impl AzureBlobStorage {
) -> Result<StorageMetadata, DownloadError> {
let builder = blob_client.get_metadata();
let response = builder.into_future().await.map_err(to_download_error)?;
let mut map = HashMap::new();
match builder.into_future().await {
Ok(r) => {
let mut map = HashMap::new();
for md in response.metadata.iter() {
map.insert(
md.name().as_str().to_string(),
md.value().as_str().to_string(),
);
for md in r.metadata.iter() {
map.insert(
md.name().as_str().to_string(),
md.value().as_str().to_string(),
);
}
Ok(StorageMetadata(map))
}
Err(e) => {
return Err(if let Some(http_err) = e.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
_ => DownloadError::Other(anyhow::Error::new(e)),
}
} else {
DownloadError::Other(e.into())
});
}
}
Ok(StorageMetadata(map))
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
@@ -170,18 +192,6 @@ fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
res
}
fn to_download_error(error: azure_core::Error) -> DownloadError {
if let Some(http_err) = error.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
_ => DownloadError::Other(anyhow::Error::new(error)),
}
} else {
DownloadError::Other(error.into())
}
}
#[async_trait::async_trait]
impl RemoteStorage for AzureBlobStorage {
async fn list_prefixes(
@@ -216,8 +226,23 @@ impl RemoteStorage for AzureBlobStorage {
let mut response = builder.into_stream();
let mut res = Vec::new();
while let Some(entry) = response.next().await {
let entry = entry.map_err(to_download_error)?;
while let Some(l) = response.next().await {
let entry = match l {
Ok(l) => l,
Err(e) => {
return Err(if let Some(http_err) = e.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::BadRequest => {
DownloadError::BadInput(anyhow::Error::new(e))
}
_ => DownloadError::Other(anyhow::Error::new(e)),
}
} else {
DownloadError::Other(e.into())
});
}
};
let name_iter = entry
.blobs
.prefixes()


@@ -27,8 +27,8 @@ and old one if it exists.
* the filecache: a struct that allows communication with the Postgres file cache.
On startup, we connect to the filecache and hold on to the connection for the
entire monitor lifetime.
* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
usage and sends rolling aggregates to the runner.
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
listening for `memory.high` events and setting its `memory.{high,max}` values.
* the runner: the runner marries the filecache and cgroup watcher together,
communicating with the agent through the `Dispatcher`, and then calling filecache
and cgroup watcher functions as needed to upscale and downscale


@@ -1,38 +1,161 @@
use std::fmt::{self, Debug, Formatter};
use std::time::{Duration, Instant};
use anyhow::{anyhow, Context};
use cgroups_rs::{
hierarchies::{self, is_cgroup2_unified_mode},
memory::MemController,
Subsystem,
use std::{
fmt::{Debug, Display},
fs,
pin::pin,
sync::atomic::{AtomicU64, Ordering},
};
use tokio::sync::watch;
use anyhow::{anyhow, bail, Context};
use cgroups_rs::{
freezer::FreezerController,
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
memory::MemController,
MaxValue,
Subsystem::{Freezer, Mem},
};
use inotify::{EventStream, Inotify, WatchMask};
use tokio::sync::mpsc::{self, error::TryRecvError};
use tokio::time::{Duration, Instant};
use tokio_stream::{Stream, StreamExt};
use tracing::{info, warn};
use crate::protocol::Resources;
use crate::MiB;
/// Monotonically increasing counter of the number of memory.high events
/// the cgroup has experienced.
///
/// We use this to determine if a modification to the `memory.events` file actually
/// changed the `high` field. If not, we don't care about the change. When we
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
/// to see if it changed since last time.
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
/// Monotonically increasing counter that gives each cgroup event a unique id.
///
/// This allows us to answer questions like "did this upscale arrive before this
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
/// with a sequence number. As such, prefer to use the `Sequenced` type rather
/// than this static directly.
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
/// A memory event type reported in memory.events.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub enum MemoryEvent {
Low,
High,
Max,
Oom,
OomKill,
OomGroupKill,
}
impl MemoryEvent {
fn as_str(&self) -> &str {
match self {
MemoryEvent::Low => "low",
MemoryEvent::High => "high",
MemoryEvent::Max => "max",
MemoryEvent::Oom => "oom",
MemoryEvent::OomKill => "oom_kill",
MemoryEvent::OomGroupKill => "oom_group_kill",
}
}
}
impl Display for MemoryEvent {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
/// Configuration for a `CgroupWatcher`
#[derive(Debug, Clone)]
pub struct Config {
/// Interval at which we should be fetching memory statistics
memory_poll_interval: Duration,
// The target difference between the total memory reserved for the cgroup
// and the value of the cgroup's memory.high.
//
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
// use (equal to system memory, minus whatever's taken out for the file cache).
oom_buffer_bytes: u64,
/// The number of samples used in constructing aggregated memory statistics
memory_history_len: usize,
/// The number of most recent samples that will be periodically logged.
///
/// Each sample is logged exactly once. Increasing this value means that recent samples will be
/// logged less frequently, and vice versa.
///
/// For simplicity, this value must be greater than or equal to `memory_history_len`.
memory_history_log_interval: usize,
// The amount of memory, in bytes, below a proposed new value for
// memory.high that the cgroup's memory usage must be for us to downscale
//
// In other words, we can downscale only when:
//
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
//
// TODO: there are some minor issues with this approach -- in particular, that we might have
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
pub(crate) memory_high_buffer_bytes: u64,
// The maximum duration, in milliseconds, that we're allowed to pause
// the cgroup for while waiting for the autoscaler-agent to upscale us
max_upscale_wait: Duration,
// The required minimum time, in milliseconds, that we must wait before re-freezing
// the cgroup while waiting for the autoscaler-agent to upscale us.
do_not_freeze_more_often_than: Duration,
// The amount of memory, in bytes, that we should periodically increase memory.high
// by while waiting for the autoscaler-agent to upscale us.
//
// This exists to avoid the excessive throttling that happens when a cgroup is above its
// memory.high for too long. See more here:
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
memory_high_increase_by_bytes: u64,
// The period, in milliseconds, at which we should repeatedly increase the value
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
// is still being hit.
//
// Technically speaking, this actually serves as a rate limit to moderate responding to
// memory.high events, but these are roughly equivalent if the process is still allocating
// memory.
memory_high_increase_every: Duration,
}
impl Config {
/// Calculate the new value for the cgroups memory.high based on system memory
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
total_system_mem.saturating_sub(self.oom_buffer_bytes)
}
}
impl Default for Config {
fn default() -> Self {
Self {
memory_poll_interval: Duration::from_millis(100),
memory_history_len: 5, // use 500ms of history for decision-making
memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
oom_buffer_bytes: 100 * MiB,
memory_high_buffer_bytes: 100 * MiB,
// while waiting for upscale, don't freeze for more than 20ms every 1s
max_upscale_wait: Duration::from_millis(20),
do_not_freeze_more_often_than: Duration::from_millis(1000),
// while waiting for upscale, increase memory.high by 10MiB every 25ms
memory_high_increase_by_bytes: 10 * MiB,
memory_high_increase_every: Duration::from_millis(25),
}
}
}
/// Used to represent data that is associated with a certain point in time, such
/// as an upscale request or memory.high event.
///
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
/// a unique sequence number. Sequence numbers are monotonically increasing,
/// allowing us to answer questions like "did this upscale happen after this
/// memory.high event?" by comparing the sequence numbers of the two events.
#[derive(Debug, Clone)]
pub struct Sequenced<T> {
seqnum: u64,
data: T,
}
impl<T> Sequenced<T> {
pub fn new(data: T) -> Self {
Self {
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
data,
}
}
}
@@ -47,14 +170,74 @@ impl Default for Config {
pub struct CgroupWatcher {
pub config: Config,
/// The sequence number of the last upscale.
///
/// If we receive a memory.high event that has a _lower_ sequence number than
/// `last_upscale_seqnum`, then we know it occurred before the upscale, and we
/// can safely ignore it.
///
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
/// use it anyways so that methods take `&self`, not `&mut self`.
last_upscale_seqnum: AtomicU64,
/// A channel on which we send messages to request upscale from the dispatcher.
upscale_requester: mpsc::Sender<()>,
/// The actual cgroup we are watching and managing.
cgroup: cgroups_rs::Cgroup,
}
/// Read memory.events for the desired event type.
///
/// `path` specifies the path to the desired `memory.events` file.
/// For more info, see the `memory.events` section of the [kernel docs]
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
let contents = fs::read_to_string(path)
.with_context(|| format!("failed to read memory.events from {path}"))?;
// The contents of the file look like:
// low 42
// high 101
// ...
contents
.lines()
.filter_map(|s| s.split_once(' '))
.find(|(e, _)| *e == event.as_str())
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
.and_then(|(_, count)| {
count
.parse::<u64>()
.with_context(|| format!("failed to parse memory.{event} as u64"))
})
}
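// Worked example (illustrative, not from the PR): for a memory.events file
// containing the two lines `low 42` and `high 101`,
// `get_event_count(path, MemoryEvent::High)` returns Ok(101), because the
// parser above picks the line whose first token equals the event name.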
/// Create an event stream that produces events whenever the file at the provided
/// path is modified.
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
info!("creating file watcher for {path}");
let inotify = Inotify::init().context("failed to initialize file watcher")?;
inotify
.watches()
.add(path, WatchMask::MODIFY)
.with_context(|| format!("failed to start watching {path}"))?;
inotify
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
// to store one event at a time - if the event gets written over, that's
// ok. We still see that there is an event. For more information, see:
// https://man7.org/linux/man-pages/man7/inotify.7.html
.into_event_stream([0u8; 1024])
.context("failed to start inotify event stream")
}
impl CgroupWatcher {
/// Create a new `CgroupWatcher`.
#[tracing::instrument(skip_all, fields(%name))]
pub fn new(name: String) -> anyhow::Result<Self> {
pub fn new(
name: String,
// A channel on which to send upscale requests
upscale_requester: mpsc::Sender<()>,
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
// TODO: clarify exactly why we need v2
// Make sure cgroups v2 (aka unified) are supported
if !is_cgroup2_unified_mode() {
@@ -62,203 +245,410 @@ impl CgroupWatcher {
}
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
Ok(Self {
cgroup,
config: Default::default(),
})
// Start monitoring the cgroup for memory events. In general, for
// cgroups v2 (aka unified), metrics are reported in files like
// > `/sys/fs/cgroup/{name}/{metric}`
// We are looking for `memory.high` events, which are stored in the
// file `memory.events`. For more info, see the `memory.events` section
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
let memory_events = create_file_watcher(&path)
.with_context(|| format!("failed to create event watcher for {path}"))?
// This would be nice with .inspect_err followed by .ok
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
Ok(high) => Some(high),
Err(error) => {
// TODO: Might want to just panic here
warn!(?error, "failed to read high events count from {}", &path);
None
}
})
// Only report the event if the memory.high count increased
.filter_map(|high| {
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
Some(high)
} else {
None
}
})
.map(Sequenced::new);
let initial_count = get_event_count(
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
MemoryEvent::High,
)?;
info!(initial_count, "initial memory.high event count");
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
// running in the cgroup before that caused it to be non-zero.
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
Ok((
Self {
cgroup,
upscale_requester,
last_upscale_seqnum: AtomicU64::new(0),
config: Default::default(),
},
memory_events,
))
}
/// The entrypoint for the `CgroupWatcher`.
#[tracing::instrument(skip_all)]
pub async fn watch(
    &self,
    updates: watch::Sender<(Instant, MemoryHistory)>,
) -> anyhow::Result<()> {
    // this requirement makes the code a bit easier to work with; see the config for more.
    assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);

    let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
    ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
    // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0

    let mem_controller = self.memory()?;

    // buffer for samples that will be logged. once full, it remains so.
    let history_log_len = self.config.memory_history_log_interval;
    let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];

    for t in 0_u64.. {
        ticker.tick().await;

        let now = Instant::now();
        let mem = Self::memory_usage(mem_controller);

        let i = t as usize % history_log_len;
        history_log_buf[i] = mem;

        // We're taking *at most* memory_history_len values; we may be bounded by the total
        // number of samples that have come in so far.
        let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
        // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
        // that we just inserted a value there, so the end of the iterator will *include* the
        // value at i, rather than stopping just short of it.
        let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);

        let summary = MemoryHistory {
            avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
                / samples_count as u64,
            samples_count,
            samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
        };

        // Log the current history if it's time to do so. Because `history_log_buf` has length
        // equal to the logging interval, we can just log the entire buffer every time we set
        // the last entry, which also means that for this log line, we can ignore that it's a
        // ring buffer (because all the entries are in order of increasing time).
        if i == history_log_len - 1 {
            info!(
                history = ?MemoryStatus::debug_slice(&history_log_buf),
                summary = ?summary,
                "Recent cgroup memory statistics history"
            );
        }

        updates
            .send((now, summary))
            .context("failed to send MemoryHistory")?;
    }

    unreachable!()
}
pub async fn watch<E>(
    &self,
    // These are ~dependency injected~ (fancy, I know) because this function
    // should never return.
    // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
    // -> therefore: if we want to stick it in an Arc so many threads can access
    //    it, methods can never take mutable access.
    //    - note: we use the Arc strategy so that a) we can call this function
    //      right here and b) the runner can call the set/get_memory methods
    // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
    //    we just pass them in here instead of holding them in fields, as that
    //    would require this method to take &mut self.
    mut upscales: mpsc::Receiver<Sequenced<Resources>>,
    events: E,
) -> anyhow::Result<()>
where
    E: Stream<Item = Sequenced<u64>>,
{
    let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
    let mut last_memory_high_increase_at: Option<Instant> = None;
    let mut events = pin!(events);
    // Are we waiting to be upscaled? Could be true if we request upscale due
    // to a memory.high event and it does not arrive in time.
    let mut waiting_on_upscale = false;
    loop {
        tokio::select! {
            upscale = upscales.recv() => {
                let Sequenced { seqnum, data } = upscale
                    .context("failed to listen on upscale notification channel")?;
                waiting_on_upscale = false;
                last_memory_high_increase_at = None;
                self.last_upscale_seqnum.store(seqnum, Ordering::Release);
                info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
            }
            event = events.next() => {
                let Some(Sequenced { seqnum, .. }) = event else {
                    bail!("failed to listen for memory.high events")
                };
                // The memory.high event came before our last upscale, so we
                // consider it resolved
                if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
                    info!(
                        "received memory.high event, but it came before our last upscale -> ignoring it"
                    );
                    continue;
                }
                // The memory.high came after our latest upscale. We don't
                // want to do anything yet, so peek the next event in hopes
                // that it's an upscale.
                if let Some(upscale_num) = self
                    .upscaled(&mut upscales)
                    .context("failed to check if we were upscaled")?
                {
                    if upscale_num > seqnum {
                        info!(
                            "received memory.high event, but it came before our last upscale -> ignoring it"
                        );
                        continue;
                    }
                }
                // If it's been long enough since we last froze, freeze the
                // cgroup and request upscale
                if wait_to_freeze.is_elapsed() {
                    info!("received memory.high event -> requesting upscale");
                    waiting_on_upscale = self
                        .handle_memory_high_event(&mut upscales)
                        .await
                        .context("failed to handle upscale")?;
                    wait_to_freeze
                        .as_mut()
                        .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
                    continue;
                }
                // Ok, we can't freeze; just request upscale
                if !waiting_on_upscale {
                    info!("received memory.high event, but too soon to refreeze -> requesting upscale");
                    // Check to make sure we haven't been upscaled in the
                    // meantime (can happen if the agent independently decides
                    // to upscale us again)
                    if self
                        .upscaled(&mut upscales)
                        .context("failed to check if we were upscaled")?
                        .is_some()
                    {
                        info!("no need to request upscaling because we got upscaled");
                        continue;
                    }
                    self.upscale_requester
                        .send(())
                        .await
                        .context("failed to request upscale")?;
                    waiting_on_upscale = true;
                    continue;
                }
                // Shoot, we can't freeze and we're still waiting on upscale;
                // increase memory.high to reduce throttling
                let can_increase_memory_high = match last_memory_high_increase_at {
                    None => true,
                    Some(t) => t.elapsed() > self.config.memory_high_increase_every,
                };
                if can_increase_memory_high {
                    info!(
                        "received memory.high event, \
                        but too soon to refreeze and already requested upscale \
                        -> increasing memory.high"
                    );
                    // Check to make sure we haven't been upscaled in the
                    // meantime (can happen if the agent independently decides
                    // to upscale us again)
                    if self
                        .upscaled(&mut upscales)
                        .context("failed to check if we were upscaled")?
                        .is_some()
                    {
                        info!("no need to increase memory.high because we got upscaled");
                        continue;
                    }
                    // Request upscale anyways (the agent will handle deduplicating
                    // requests)
                    self.upscale_requester
                        .send(())
                        .await
                        .context("failed to request upscale")?;
                    let memory_high =
                        self.get_memory_high_bytes().context("failed to get memory.high")?;
                    let new_high = memory_high + self.config.memory_high_increase_by_bytes;
                    info!(
                        current_high_bytes = memory_high,
                        new_high_bytes = new_high,
                        "updating memory.high"
                    );
                    self.set_memory_high_bytes(new_high)
                        .context("failed to set memory.high")?;
                    last_memory_high_increase_at = Some(Instant::now());
                    continue;
                }
                info!("received memory.high event, but can't do anything");
            }
        };
    }
}
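// Worked ordering example (illustrative): upscales and memory.high events
// are stamped from the same counter via `Sequenced::new`, so their seqnums
// are comparable across channels. If the last upscale was stamped 7 and an
// event arrives stamped 5, `fetch_max(5)` returns 7 and 7 > 5, so the event
// is ignored as already resolved by that upscale; an event stamped 9 gets
// back 7, 7 > 9 is false, and it is handled as new memory pressure.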
/// Handle a `memory.high` event, returning whether we are still waiting on upscale
/// by the time the function returns.
///
/// The general plan for handling a `memory.high` event is as follows:
/// 1. Freeze the cgroup
/// 2. Start a timer for `self.config.max_upscale_wait`
/// 3. Request upscale
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
/// 5. Return whether or not we are still waiting for upscale. If we are,
/// we'll increase the cgroup's memory.high to avoid getting OOM-killed
#[tracing::instrument(skip_all)]
async fn handle_memory_high_event(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<bool> {
// Immediately freeze the cgroup before doing anything else.
info!("received memory.high event -> freezing cgroup");
self.freeze().context("failed to freeze cgroup")?;
// We'll use this for logging durations
let start_time = Instant::now();
// Await the upscale until we have to unfreeze
let timed =
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
// Request the upscale
info!(
wait = ?self.config.max_upscale_wait,
"sending request for immediate upscaling",
);
self.upscale_requester
.send(())
.await
.context("failed to request upscale")?;
let waiting_on_upscale = match timed.await {
Ok(Ok(())) => {
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
false
}
// **important**: unfreeze the cgroup before ?-reporting the error
Ok(Err(e)) => {
info!("error waiting for upscale -> thawing cgroup");
self.thaw()
.context("failed to thaw cgroup after errored waiting for upscale")?;
Err(e.context("failed to await upscale"))?
}
Err(_) => {
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
true
}
};
info!("thawing cgroup");
self.thaw().context("failed to thaw cgroup")?;
Ok(waiting_on_upscale)
}
/// Checks whether we were just upscaled, returning the upscale's sequence
/// number if so.
#[tracing::instrument(skip_all)]
fn upscaled(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<Option<u64>> {
let Sequenced { seqnum, data } = match upscales.try_recv() {
Ok(upscale) => upscale,
Err(TryRecvError::Empty) => return Ok(None),
Err(TryRecvError::Disconnected) => {
bail!("upscale notification channel was disconnected")
}
};
// Make sure to update the last upscale sequence number
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
Ok(Some(seqnum))
}
/// Await an upscale event, discarding any `memory.high` events received in
/// the process.
///
/// This is used in `handle_memory_high_event`, where we need to listen
/// for upscales in particular so we know if we can thaw the cgroup early.
#[tracing::instrument(skip_all)]
async fn await_upscale(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<()> {
let Sequenced { seqnum, .. } = upscales
.recv()
.await
.context("error listening for upscales")?;
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
Ok(())
}
/// Get the cgroup's name.
pub fn path(&self) -> &str {
self.cgroup.path()
}
}
// Methods for manipulating the actual cgroup
impl CgroupWatcher {
/// Get a handle on the freezer subsystem.
fn freezer(&self) -> anyhow::Result<&FreezerController> {
if let Some(Freezer(freezer)) = self
.cgroup
.subsystems()
.iter()
.find(|sub| matches!(sub, Freezer(_)))
{
Ok(freezer)
} else {
anyhow::bail!("could not find freezer subsystem")
}
}
/// Attempt to freeze the cgroup.
pub fn freeze(&self) -> anyhow::Result<()> {
self.freezer()
.context("failed to get freezer subsystem")?
.freeze()
.context("failed to freeze")
}
/// Attempt to thaw the cgroup.
pub fn thaw(&self) -> anyhow::Result<()> {
self.freezer()
.context("failed to get freezer subsystem")?
.thaw()
.context("failed to thaw")
}
/// Get a handle on the memory subsystem.
///
/// Note: this method does not require `self.memory_update_lock` because
/// getting a handle to the subsystem does not access any of the files we
/// care about, such as memory.high and memory.events
fn memory(&self) -> anyhow::Result<&MemController> {
    self.cgroup
        .subsystems()
        .iter()
        .find_map(|sub| match sub {
            Subsystem::Mem(c) => Some(c),
            _ => None,
        })
        .ok_or_else(|| anyhow!("could not find memory subsystem"))
}
fn memory(&self) -> anyhow::Result<&MemController> {
    if let Some(Mem(memory)) = self
        .cgroup
        .subsystems()
        .iter()
        .find(|sub| matches!(sub, Mem(_)))
    {
        Ok(memory)
    } else {
        anyhow::bail!("could not find memory subsystem")
    }
}
/// Get cgroup current memory usage.
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
Ok(self
.memory()
.context("failed to get memory subsystem")?
.memory_stat()
.usage_in_bytes)
}
/// Set cgroup memory.high threshold.
pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
}
/// Set the cgroup's memory.high to 'max', disabling it.
pub fn unset_memory_high(&self) -> anyhow::Result<()> {
self.set_memory_high_internal(MaxValue::Max)
}
fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
self.memory()
.context("failed to get memory subsystem")?
.set_mem(cgroups_rs::memory::SetMemory {
low: None,
high: Some(value),
min: None,
max: None,
})
.map_err(anyhow::Error::from)
}
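// Usage sketch (hypothetical caller, methods as defined above):
//   watcher.set_memory_high_bytes(2 * 1024 * 1024 * 1024)?; // memory.high = 2 GiB
//   watcher.unset_memory_high()?;                           // memory.high = "max"
// Values above i64::MAX are clamped by set_memory_high_bytes, since the
// underlying MaxValue::Value holds an i64.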
/// Given a handle on the memory subsystem, returns the current memory information
fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
    let stat = mem_controller.memory_stat().stat;
    MemoryStatus {
        non_reclaimable: stat.active_anon + stat.inactive_anon,
    }
}
/// Get memory.high threshold.
pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
let high = self
.memory()
.context("failed to get memory subsystem while getting memory statistics")?
.get_mem()
.map(|mem| mem.high)
.context("failed to get memory statistics from subsystem")?;
match high {
Some(MaxValue::Max) => Ok(i64::MAX as u64),
Some(MaxValue::Value(high)) => Ok(high as u64),
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
}
}
}
// Helper function for `CgroupWatcher::watch`
fn ring_buf_recent_values_iter<T>(
buf: &[T],
last_value_idx: usize,
count: usize,
) -> impl '_ + Iterator<Item = &T> {
// Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
// easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
assert!(count <= buf.len());
buf.iter()
// 'cycle' because the values could wrap around
.cycle()
// with 'cycle', this skip is more like 'offset', and functionally this is
// offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
// careful to avoid underflow, so we pre-add buf.len().
// The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
.skip((buf.len() + last_value_idx + 1 - count) % buf.len())
.take(count)
}
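// Worked example of the skip arithmetic above: with buf.len() = 10,
// last_value_idx = 2, and count = 4, we skip (10 + 2 + 1 - 4) % 10 = 9
// elements of the cycled iterator and then take 4, yielding the entries at
// indices 9, 0, 1, 2 -- the four most recent values in order (see the
// `ring_buf_iter` test below, which checks exactly this case).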
/// Summary of recent memory usage
#[derive(Debug, Copy, Clone)]
pub struct MemoryHistory {
/// Rolling average of non-reclaimable memory usage samples over the last `history_period`
pub avg_non_reclaimable: u64,
/// The number of samples used to construct this summary
pub samples_count: usize,
/// Total timespan between the first and last sample used for this summary
pub samples_span: Duration,
}
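// Example with illustrative numbers: for memory_poll_interval = 100ms and
// samples_count = 5, samples_span = 100ms * (5 - 1) = 400ms, i.e. the span
// covers the four gaps between five consecutive samples, and
// avg_non_reclaimable is the mean of those five readings.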
#[derive(Debug, Copy, Clone)]
pub struct MemoryStatus {
non_reclaimable: u64,
}
impl MemoryStatus {
fn zeroed() -> Self {
MemoryStatus { non_reclaimable: 0 }
}
fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
struct DS<'a>(&'a [MemoryStatus]);
impl<'a> Debug for DS<'a> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_struct("[MemoryStatus]")
.field(
"non_reclaimable[..]",
&Fields(self.0, |stat: &MemoryStatus| {
BytesToGB(stat.non_reclaimable)
}),
)
.finish()
}
}
struct Fields<'a, F>(&'a [MemoryStatus], F);
impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_list().entries(self.0.iter().map(&self.1)).finish()
}
}
struct BytesToGB(u64);
impl Debug for BytesToGB {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.write_fmt(format_args!(
"{:.3}Gi",
self.0 as f64 / (1_u64 << 30) as f64
))
}
}
DS(slice)
}
}
#[cfg(test)]
mod tests {
#[test]
fn ring_buf_iter() {
let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
let values = |offset, count| {
super::ring_buf_recent_values_iter(&buf, offset, count)
.copied()
.collect::<Vec<i32>>()
};
// Boundary conditions: start, end, and entire thing:
assert_eq!(values(0, 1), [0]);
assert_eq!(values(3, 4), [0, 1, 2, 3]);
assert_eq!(values(9, 4), [6, 7, 8, 9]);
assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
// "normal" operation: no wraparound
assert_eq!(values(7, 4), [4, 5, 6, 7]);
// wraparound:
assert_eq!(values(0, 4), [7, 8, 9, 0]);
assert_eq!(values(1, 4), [8, 9, 0, 1]);
assert_eq!(values(2, 4), [9, 0, 1, 2]);
assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
}
}

View File

@@ -12,10 +12,12 @@ use futures::{
stream::{SplitSink, SplitStream},
SinkExt, StreamExt,
};
use tokio::sync::mpsc;
use tracing::info;
use crate::cgroup::Sequenced;
use crate::protocol::{
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
PROTOCOL_MIN_VERSION,
};
@@ -34,6 +36,13 @@ pub struct Dispatcher {
/// We send messages to the agent through `sink`
sink: SplitSink<WebSocket, Message>,
/// Used to notify the cgroup when we are upscaled.
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
/// When the cgroup requests upscale it will send on this channel. In response
/// we send an `UpscaleRequest` to the agent.
pub(crate) request_upscale_events: mpsc::Receiver<()>,
/// The protocol version we have agreed to use with the agent. This is negotiated
/// during the creation of the dispatcher, and should be the highest shared protocol
/// version.
@@ -52,7 +61,11 @@ impl Dispatcher {
/// 1. Wait for the agent to send the range of protocols it supports.
/// 2. Send a protocol version that works for us as well, or an error if there
/// is no compatible version.
pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
pub async fn new(
stream: WebSocket,
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
request_upscale_events: mpsc::Receiver<()>,
) -> anyhow::Result<Self> {
let (mut sink, mut source) = stream.split();
// Figure out the highest protocol version we both support
@@ -106,10 +119,22 @@ impl Dispatcher {
Ok(Self {
sink,
source,
notify_upscale_events,
request_upscale_events,
proto_version: highest_shared_version,
})
}
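// Sketch (assumed semantics, not part of this diff) of how a highest shared
// version can be picked from two inclusive [min, max] protocol ranges:
fn highest_shared(ours: (u32, u32), theirs: (u32, u32)) -> Option<u32> {
    let low = ours.0.max(theirs.0); // can't go below either side's minimum
    let high = ours.1.min(theirs.1); // can't go above either side's maximum
    (low <= high).then_some(high) // None means no compatible version exists
}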
/// Notify the cgroup manager that we have received upscale and wait for
/// the acknowledgement.
#[tracing::instrument(skip_all, fields(?resources))]
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
self.notify_upscale_events
.send(resources)
.await
.context("failed to send resources and oneshot sender across channel")
}
/// Send a message to the agent.
///
/// Although this function is small, it has one major benefit: it is the only

View File

@@ -5,16 +5,18 @@
//! all functionality.
use std::fmt::Debug;
use std::sync::Arc;
use std::time::{Duration, Instant};
use anyhow::{bail, Context};
use axum::extract::ws::{Message, WebSocket};
use futures::StreamExt;
use tokio::sync::{broadcast, watch};
use tokio::sync::broadcast;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use crate::cgroup::{self, CgroupWatcher};
use crate::cgroup::{CgroupWatcher, Sequenced};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -26,7 +28,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
pub struct Runner {
config: Config,
filecache: Option<FileCacheState>,
cgroup: Option<CgroupState>,
cgroup: Option<Arc<CgroupWatcher>>,
dispatcher: Dispatcher,
/// We "mint" new message ids by incrementing this counter and taking the value.
@@ -43,14 +45,6 @@ pub struct Runner {
kill: broadcast::Receiver<()>,
}
#[derive(Debug)]
struct CgroupState {
watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
/// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
/// requests.
threshold: u64,
}
/// Configuration for a `Runner`
#[derive(Debug)]
pub struct Config {
@@ -68,56 +62,16 @@ pub struct Config {
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
/// should be removed once we have a better solution there.
sys_buffer_bytes: u64,
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
/// threshold.
///
/// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
/// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
/// memory.
///
/// The default value of `0.15` means that we *guarantee* sending upscale requests if the
/// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
/// memory for the file cache).
cgroup_min_overhead_fraction: f64,
cgroup_downscale_threshold_buffer_bytes: u64,
}
impl Default for Config {
fn default() -> Self {
Self {
sys_buffer_bytes: 100 * MiB,
cgroup_min_overhead_fraction: 0.15,
cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
}
}
}
impl Config {
fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
// If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
// and thus be non-reclaimable, so we should allow for additional memory usage.
//
// If the file cache sits on disk, our desired stable system state is for it to be fully
// page cached (its contents should only be paged to/from disk in situations where we can't
// upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
// threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
// out the file cache.
let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
// Even if we're not separately making room for the file cache (if it's in tmpfs), we still
// want our threshold to be met gracefully instead of letting postgres get OOM-killed.
// So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
// remaining above the threshold.
let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
memory_remaining_for_cgroup.min(max_threshold)
}
}
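// Worked example (illustrative numbers): with total_mem = 8 GiB,
// file_cache_disk_size = 1 GiB, and cgroup_min_overhead_fraction = 0.15:
//   memory_remaining_for_cgroup = 8 GiB - 1 GiB = 7 GiB
//   max_threshold = 8 GiB * (1.0 - 0.15) = 6.8 GiB
//   cgroup_threshold = min(7 GiB, 6.8 GiB) = 6.8 GiB
// so here the overhead cap, not the file cache reservation, is what binds.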
impl Runner {
/// Create a new monitor.
#[tracing::instrument(skip_all, fields(?config, ?args))]
@@ -133,7 +87,12 @@ impl Runner {
"invalid monitor Config: sys_buffer_bytes cannot be 0"
);
let dispatcher = Dispatcher::new(ws)
// *NOTE*: the dispatcher and cgroup manager talk through these channels
// so make sure they each get the correct half, nothing is dropped, etc.
let (notified_send, notified_recv) = mpsc::channel(1);
let (requesting_send, requesting_recv) = mpsc::channel(1);
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
.await
.context("error creating new dispatcher")?;
@@ -147,9 +106,45 @@ impl Runner {
kill,
};
let mem = get_total_system_memory();
// If we have both the cgroup and file cache integrations enabled, it's possible for
// temporary failures to result in cgroup throttling (from memory.high), that in turn makes
// it near-impossible to connect to the file cache (because it times out). Unfortunately,
// we *do* still want to determine the file cache size before setting the cgroup's
// memory.high, so it's not as simple as just swapping the order.
//
// Instead, the resolution here is that on vm-monitor startup (note: happens on each
// connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
// temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
// of a hacky solution, but helps with reliability.
if let Some(name) = &args.cgroup {
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
// now, and then set limits later.
info!("initializing cgroup");
let mut file_cache_disk_size = 0;
let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
info!("temporarily unsetting memory.high");
// Temporarily un-set cgroup memory.high; see above.
cgroup
.unset_memory_high()
.context("failed to unset memory.high")?;
let cgroup = Arc::new(cgroup);
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(
token.clone(),
|_| error!("cgroup watcher terminated"),
async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
);
state.cgroup = Some(cgroup);
}
let mut file_cache_reserved_bytes = 0;
let mem = get_total_system_memory();
// We need to process file cache initialization before cgroup initialization, so that the memory
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
@@ -161,7 +156,7 @@ impl Runner {
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
let mut file_cache = FileCacheState::new(connstr, config, token)
.await
.context("failed to create file cache")?;
@@ -186,40 +181,23 @@ impl Runner {
if actual_size != new_size {
info!("file cache size actually got set to {actual_size}")
}
if args.file_cache_on_disk {
file_cache_disk_size = actual_size;
// Mark the resources given to the file cache as reserved, but only if it's in memory.
if !args.file_cache_on_disk {
file_cache_reserved_bytes = actual_size;
}
state.filecache = Some(file_cache);
}
if let Some(name) = &args.cgroup {
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
// now, and then set limits later.
info!("initializing cgroup");
if let Some(cgroup) = &state.cgroup {
let available = mem - file_cache_reserved_bytes;
let value = cgroup.config.calculate_memory_high_value(available);
let cgroup =
CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;
info!(value, "setting memory.high");
let init_value = cgroup::MemoryHistory {
avg_non_reclaimable: 0,
samples_count: 0,
samples_span: Duration::ZERO,
};
let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
cgroup.watch(hist_tx).await
});
let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
info!(threshold, "set initial cgroup threshold",);
state.cgroup = Some(CgroupState {
watcher: hist_rx,
threshold,
});
cgroup
.set_memory_high_bytes(value)
.context("failed to set cgroup memory.high")?;
}
Ok(state)
@@ -239,51 +217,28 @@ impl Runner {
let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let (expected_file_cache_size, expected_file_cache_disk_size) = self
let expected_file_cache_mem_usage = self
.filecache
.as_ref()
.map(|file_cache| {
let size = file_cache.config.calculate_cache_size(usable_system_memory);
match file_cache.config.in_memory {
true => (size, 0),
false => (size, size),
}
})
.unwrap_or((0, 0));
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
.unwrap_or(0);
let mut new_cgroup_mem_high = 0;
if let Some(cgroup) = &self.cgroup {
let (last_time, last_history) = *cgroup.watcher.borrow();
// NB: The ordering of these conditions is intentional. During startup, we should deny
// downscaling until we have enough information to determine that it's safe to do so
// (i.e. enough samples have come in). But if it's been a while and we *still* haven't
// received any information, we should *fail* instead of just denying downscaling.
//
// `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
// serves double-duty: it trips if we haven't received *any* metrics for long enough,
// OR if we haven't received metrics *recently enough*.
//
// TODO: make the duration here configurable.
if last_time.elapsed() > Duration::from_secs(5) {
bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
} else if last_history.samples_count <= 1 {
let status = "haven't received enough cgroup memory stats yet";
info!(status, "discontinuing downscale");
return Ok((false, status.to_owned()));
}
let new_threshold = self
new_cgroup_mem_high = cgroup
.config
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
let current = last_history.avg_non_reclaimable;
let current = cgroup
.current_memory_usage()
.context("failed to fetch cgroup memory")?;
if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
let status = format!(
"{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
"calculated memory threshold too low",
bytes_to_mebibytes(new_threshold),
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
"calculated memory.high too low",
bytes_to_mebibytes(new_cgroup_mem_high),
bytes_to_mebibytes(current),
bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
);
info!(status, "discontinuing downscale");
@@ -294,14 +249,14 @@ impl Runner {
// The downscaling has been approved. Downscale the file cache, then the cgroup.
let mut status = vec![];
let mut file_cache_disk_size = 0;
let mut file_cache_mem_usage = 0;
if let Some(file_cache) = &mut self.filecache {
let actual_usage = file_cache
.set_file_cache_size(expected_file_cache_size)
.set_file_cache_size(expected_file_cache_mem_usage)
.await
.context("failed to set file cache size")?;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
if file_cache.config.in_memory {
file_cache_mem_usage = actual_usage;
}
let message = format!(
"set file cache size to {} MiB (in memory = {})",
@@ -312,18 +267,24 @@ impl Runner {
status.push(message);
}
if let Some(cgroup) = &mut self.cgroup {
let new_threshold = self
.config
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
if let Some(cgroup) = &self.cgroup {
let available_memory = usable_system_memory - file_cache_mem_usage;
if file_cache_mem_usage != expected_file_cache_mem_usage {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
// new_cgroup_mem_high is initialized to 0, but it is guaranteed not to be 0 here,
// since it was properly initialized in the previous cgroup `if let` block
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
let message = format!(
"set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
bytes_to_mebibytes(cgroup.threshold),
bytes_to_mebibytes(new_threshold),
bytes_to_mebibytes(usable_system_memory)
"set cgroup memory.high to {} MiB, of new max {} MiB",
bytes_to_mebibytes(new_cgroup_mem_high),
bytes_to_mebibytes(available_memory)
);
cgroup.threshold = new_threshold;
info!("downscale: {message}");
status.push(message);
}
@@ -344,7 +305,8 @@ impl Runner {
let new_mem = resources.mem;
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
let mut file_cache_disk_size = 0;
// Get the file cache's expected contribution to the memory usage
let mut file_cache_mem_usage = 0;
if let Some(file_cache) = &mut self.filecache {
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
info!(
@@ -357,8 +319,8 @@ impl Runner {
.set_file_cache_size(expected_usage)
.await
.context("failed to set file cache size")?;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
if file_cache.config.in_memory {
file_cache_mem_usage = actual_usage;
}
if actual_usage != expected_usage {
@@ -370,18 +332,18 @@ impl Runner {
}
}
if let Some(cgroup) = &mut self.cgroup {
let new_threshold = self
.config
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
if let Some(cgroup) = &self.cgroup {
let available_memory = usable_system_memory - file_cache_mem_usage;
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
info!(
"set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
bytes_to_mebibytes(cgroup.threshold),
bytes_to_mebibytes(new_threshold),
bytes_to_mebibytes(usable_system_memory)
target = bytes_to_mebibytes(new_cgroup_mem_high),
total = bytes_to_mebibytes(new_mem),
name = cgroup.path(),
"updating cgroup memory.high",
);
cgroup.threshold = new_threshold;
cgroup
.set_memory_high_bytes(new_cgroup_mem_high)
.context("failed to set cgroup memory.high")?;
}
Ok(())
@@ -399,6 +361,10 @@ impl Runner {
self.handle_upscale(granted)
.await
.context("failed to handle upscale")?;
self.dispatcher
.notify_upscale(Sequenced::new(granted))
.await
.context("failed to notify notify cgroup of upscale")?;
Ok(Some(OutboundMsg::new(
OutboundMsgKind::UpscaleConfirmation {},
id,
@@ -442,53 +408,33 @@ impl Runner {
Err(e) => bail!("failed to receive kill signal: {e}")
}
}
// New memory stats from the cgroup, *may* need to request upscaling, if we've
// exceeded the threshold
result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
result.context("failed to receive from cgroup memory stats watcher")?;
let cgroup = self.cgroup.as_ref().unwrap();
let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
// If we haven't exceeded the threshold, then we're all ok
if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
continue;
// we need to propagate an upscale request
request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
if request.is_none() {
bail!("failed to listen for upscale event from cgroup")
}
// Otherwise, we generally want upscaling. But, if it's been less than 1 second
// since the last time we requested upscaling, ignore the event, to avoid
// spamming the agent.
// If it's been less than 1 second since the last time we requested upscaling,
// ignore the event, to avoid spamming the agent (otherwise, this can happen
// ~1k times per second).
if let Some(t) = self.last_upscale_request_at {
let elapsed = t.elapsed();
if elapsed < Duration::from_secs(1) {
info!(
elapsed_millis = elapsed.as_millis(),
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
threshold = bytes_to_mebibytes(cgroup.threshold),
"cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
);
info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
continue;
}
}
self.last_upscale_request_at = Some(Instant::now());
info!(
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
threshold = bytes_to_mebibytes(cgroup.threshold),
"cgroup memory stats are high enough to upscale, requesting upscale",
);
info!("cgroup asking for upscale; forwarding request");
self.counter += 2; // Increment, preserving parity (i.e. keep the
// counter odd). See the field comment for more.
self.dispatcher
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
.await
.context("failed to send message")?;
},
}
// there is a message from the agent
msg = self.dispatcher.source.next() => {
if let Some(msg) = msg {
@@ -516,14 +462,11 @@ impl Runner {
Ok(Some(out)) => out,
Ok(None) => continue,
Err(e) => {
// use {:#} for our logging because the display impl only
// gives the outermost cause, and the debug impl
// pretty-prints the error, whereas {:#} contains all the
// causes, but is compact (no newlines).
warn!(error = format!("{e:#}"), "error handling message");
let error = e.to_string();
warn!(?error, "error handling message");
OutboundMsg::new(
OutboundMsgKind::InternalError {
error: e.to_string(),
error
},
message.id
)
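// Illustrative sketch (names hypothetical) of the one-second debounce applied
// above before forwarding cgroup upscale requests to the agent, which could
// otherwise fire ~1k times per second:
use std::time::{Duration, Instant};

struct UpscaleDebounce {
    last_request_at: Option<Instant>,
    min_gap: Duration, // e.g. Duration::from_secs(1)
}

impl UpscaleDebounce {
    fn should_forward(&mut self) -> bool {
        match self.last_request_at {
            // Too soon since the last forwarded request: drop this one.
            Some(t) if t.elapsed() < self.min_gap => false,
            _ => {
                self.last_request_at = Some(Instant::now());
                true
            }
        }
    }
}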

View File

@@ -1,16 +0,0 @@
[package]
name = "walproposer"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
utils.workspace = true
postgres_ffi.workspace = true
workspace_hack.workspace = true
[build-dependencies]
anyhow.workspace = true
bindgen.workspace = true

View File

@@ -1 +0,0 @@
#include "walproposer.h"

View File

@@ -1,113 +0,0 @@
use std::{env, path::PathBuf, process::Command};
use anyhow::{anyhow, Context};
use bindgen::CargoCallbacks;
fn main() -> anyhow::Result<()> {
// Tell cargo to invalidate the built crate whenever the wrapper changes
println!("cargo:rerun-if-changed=bindgen_deps.h");
// Finding the location of built libraries and Postgres C headers:
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
postgres_install_dir.into()
} else {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
};
let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
let walproposer_lib_search_str = walproposer_lib_dir
.to_str()
.ok_or(anyhow!("Bad non-UTF path"))?;
let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
println!("cargo:rustc-link-lib=static=pgport");
println!("cargo:rustc-link-lib=static=pgcommon");
println!("cargo:rustc-link-lib=static=walproposer");
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
let inc_server_path: String = if pg_config_bin.exists() {
let output = Command::new(pg_config_bin)
.arg("--includedir-server")
.output()
.context("failed to execute `pg_config --includedir-server`")?;
if !output.status.success() {
panic!("`pg_config --includedir-server` failed")
}
String::from_utf8(output.stdout)
.context("pg_config output is not UTF-8")?
.trim_end()
.into()
} else {
let server_path = pg_install_abs
.join("v16")
.join("include")
.join("postgresql")
.join("server")
.into_os_string();
server_path
.into_string()
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
};
// The bindgen::Builder is the main entry point
// to bindgen, and lets you build up options for
// the resulting bindings.
let bindings = bindgen::Builder::default()
// The input header we would like to generate
// bindings for.
.header("bindgen_deps.h")
// Tell cargo to invalidate the built crate whenever any of the
// included header files changed.
.parse_callbacks(Box::new(CargoCallbacks))
.allowlist_type("WalProposer")
.allowlist_type("WalProposerConfig")
.allowlist_type("walproposer_api")
.allowlist_function("WalProposerCreate")
.allowlist_function("WalProposerStart")
.allowlist_function("WalProposerBroadcast")
.allowlist_function("WalProposerPoll")
.allowlist_function("WalProposerFree")
.allowlist_var("DEBUG5")
.allowlist_var("DEBUG4")
.allowlist_var("DEBUG3")
.allowlist_var("DEBUG2")
.allowlist_var("DEBUG1")
.allowlist_var("LOG")
.allowlist_var("INFO")
.allowlist_var("NOTICE")
.allowlist_var("WARNING")
.allowlist_var("ERROR")
.allowlist_var("FATAL")
.allowlist_var("PANIC")
.allowlist_var("WPEVENT")
.allowlist_var("WL_LATCH_SET")
.allowlist_var("WL_SOCKET_READABLE")
.allowlist_var("WL_SOCKET_WRITEABLE")
.allowlist_var("WL_TIMEOUT")
.allowlist_var("WL_SOCKET_CLOSED")
.allowlist_var("WL_SOCKET_MASK")
.clang_arg("-DWALPROPOSER_LIB")
.clang_arg(format!("-I{pgxn_neon}"))
.clang_arg(format!("-I{inc_server_path}"))
// Finish the builder and generate the bindings.
.generate()
// Unwrap the Result and panic on failure.
.expect("Unable to generate bindings");
// Write the bindings to the $OUT_DIR/bindings.rs file.
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
bindings
.write_to_file(out_path)
.expect("Couldn't write bindings!");
Ok(())
}

View File

@@ -1,455 +0,0 @@
#![allow(dead_code)]
use std::ffi::CStr;
use std::ffi::CString;
use crate::bindings::uint32;
use crate::bindings::walproposer_api;
use crate::bindings::PGAsyncReadResult;
use crate::bindings::PGAsyncWriteResult;
use crate::bindings::Safekeeper;
use crate::bindings::Size;
use crate::bindings::StringInfoData;
use crate::bindings::TimeLineID;
use crate::bindings::TimestampTz;
use crate::bindings::WalProposer;
use crate::bindings::WalProposerConnStatusType;
use crate::bindings::WalProposerConnectPollStatusType;
use crate::bindings::WalProposerExecStatusType;
use crate::bindings::WalproposerShmemState;
use crate::bindings::XLogRecPtr;
use crate::walproposer::ApiImpl;
use crate::walproposer::WaitResult;
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).get_shmem_state()
}
}
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).start_streaming(startpos)
}
}
extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).get_flush_rec_ptr()
}
}
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).get_current_timestamp()
}
}
extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
let msg = (*api).conn_error_message(&mut (*sk));
let msg = CString::new(msg).unwrap();
// TODO: fix leaking error message
msg.into_raw()
}
}
extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_status(&mut (*sk))
}
}
extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_connect_start(&mut (*sk))
}
}
extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_connect_poll(&mut (*sk))
}
}
extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
let query = unsafe { CStr::from_ptr(query) };
let query = query.to_str().unwrap();
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_send_query(&mut (*sk), query)
}
}
extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_get_query_result(&mut (*sk))
}
}
extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_flush(&mut (*sk))
}
}
extern "C" fn conn_finish(sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_finish(&mut (*sk))
}
}
extern "C" fn conn_async_read(
sk: *mut Safekeeper,
buf: *mut *mut ::std::os::raw::c_char,
amount: *mut ::std::os::raw::c_int,
) -> PGAsyncReadResult {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
let (res, result) = (*api).conn_async_read(&mut (*sk));
// This function guarantees that the returned buf will be valid until
// the next call. So we can store a Vec in each Safekeeper and reuse
// it on the next call.
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
inbuf.clear();
inbuf.extend_from_slice(res);
// Put a Vec back to sk->inbuf and return data ptr.
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
*amount = res.len() as i32;
result
}
}
extern "C" fn conn_async_write(
sk: *mut Safekeeper,
buf: *const ::std::os::raw::c_void,
size: usize,
) -> PGAsyncWriteResult {
unsafe {
let buf = std::slice::from_raw_parts(buf as *const u8, size);
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_async_write(&mut (*sk), buf)
}
}
extern "C" fn conn_blocking_write(
sk: *mut Safekeeper,
buf: *const ::std::os::raw::c_void,
size: usize,
) -> bool {
unsafe {
let buf = std::slice::from_raw_parts(buf as *const u8, size);
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).conn_blocking_write(&mut (*sk), buf)
}
}
extern "C" fn recovery_download(
sk: *mut Safekeeper,
_timeline: TimeLineID,
startpos: XLogRecPtr,
endpos: XLogRecPtr,
) -> bool {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).recovery_download(&mut (*sk), startpos, endpos)
}
}
extern "C" fn wal_read(
sk: *mut Safekeeper,
buf: *mut ::std::os::raw::c_char,
startptr: XLogRecPtr,
count: Size,
) {
unsafe {
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).wal_read(&mut (*sk), buf, startptr)
}
}
extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).wal_reader_allocate(&mut (*sk));
}
}
extern "C" fn free_event_set(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).free_event_set(&mut (*wp));
}
}
extern "C" fn init_event_set(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).init_event_set(&mut (*wp));
}
}
extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).update_event_set(&mut (*sk), events);
}
}
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
unsafe {
let callback_data = (*(*(*sk).wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).add_safekeeper_event_set(&mut (*sk), events);
}
}
extern "C" fn wait_event_set(
wp: *mut WalProposer,
timeout: ::std::os::raw::c_long,
event_sk: *mut *mut Safekeeper,
events: *mut uint32,
) -> ::std::os::raw::c_int {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
let result = (*api).wait_event_set(&mut (*wp), timeout);
match result {
WaitResult::Latch => {
*event_sk = std::ptr::null_mut();
*events = crate::bindings::WL_LATCH_SET;
1
}
WaitResult::Timeout => {
*event_sk = std::ptr::null_mut();
*events = crate::bindings::WL_TIMEOUT;
0
}
WaitResult::Network(sk, event_mask) => {
*event_sk = sk;
*events = event_mask;
1
}
}
}
}
extern "C" fn strong_random(
wp: *mut WalProposer,
buf: *mut ::std::os::raw::c_void,
len: usize,
) -> bool {
unsafe {
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).strong_random(buf)
}
}
extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).get_redo_start_lsn()
}
}
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).finish_sync_safekeepers(lsn)
}
}
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
}
}
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).confirm_wal_streamed(&mut (*wp), lsn)
}
}
extern "C" fn log_internal(
wp: *mut WalProposer,
level: ::std::os::raw::c_int,
line: *const ::std::os::raw::c_char,
) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
let line = CStr::from_ptr(line);
let line = line.to_str().unwrap();
(*api).log_internal(&mut (*wp), Level::from(level as u32), line)
}
}
extern "C" fn after_election(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).after_election(&mut (*wp))
}
}
#[derive(Debug)]
pub enum Level {
Debug5,
Debug4,
Debug3,
Debug2,
Debug1,
Log,
Info,
Notice,
Warning,
Error,
Fatal,
Panic,
WPEvent,
}
impl Level {
pub fn from(elevel: u32) -> Level {
use crate::bindings::*;
match elevel {
DEBUG5 => Level::Debug5,
DEBUG4 => Level::Debug4,
DEBUG3 => Level::Debug3,
DEBUG2 => Level::Debug2,
DEBUG1 => Level::Debug1,
LOG => Level::Log,
INFO => Level::Info,
NOTICE => Level::Notice,
WARNING => Level::Warning,
ERROR => Level::Error,
FATAL => Level::Fatal,
PANIC => Level::Panic,
WPEVENT => Level::WPEvent,
_ => panic!("unknown log level {}", elevel),
}
}
}
pub(crate) fn create_api() -> walproposer_api {
walproposer_api {
get_shmem_state: Some(get_shmem_state),
start_streaming: Some(start_streaming),
get_flush_rec_ptr: Some(get_flush_rec_ptr),
get_current_timestamp: Some(get_current_timestamp),
conn_error_message: Some(conn_error_message),
conn_status: Some(conn_status),
conn_connect_start: Some(conn_connect_start),
conn_connect_poll: Some(conn_connect_poll),
conn_send_query: Some(conn_send_query),
conn_get_query_result: Some(conn_get_query_result),
conn_flush: Some(conn_flush),
conn_finish: Some(conn_finish),
conn_async_read: Some(conn_async_read),
conn_async_write: Some(conn_async_write),
conn_blocking_write: Some(conn_blocking_write),
recovery_download: Some(recovery_download),
wal_read: Some(wal_read),
wal_reader_allocate: Some(wal_reader_allocate),
free_event_set: Some(free_event_set),
init_event_set: Some(init_event_set),
update_event_set: Some(update_event_set),
add_safekeeper_event_set: Some(add_safekeeper_event_set),
wait_event_set: Some(wait_event_set),
strong_random: Some(strong_random),
get_redo_start_lsn: Some(get_redo_start_lsn),
finish_sync_safekeepers: Some(finish_sync_safekeepers),
process_safekeeper_feedback: Some(process_safekeeper_feedback),
confirm_wal_streamed: Some(confirm_wal_streamed),
log_internal: Some(log_internal),
after_election: Some(after_election),
}
}
impl std::fmt::Display for Level {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
/// Take ownership of `Vec<u8>` from StringInfoData.
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
if pg.data.is_null() {
return None;
}
let ptr = pg.data as *mut u8;
let length = pg.len as usize;
let capacity = pg.maxlen as usize;
pg.data = std::ptr::null_mut();
pg.len = 0;
pg.maxlen = 0;
unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
}
/// Store `Vec<u8>` in StringInfoData.
fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
let length = vec.len();
let capacity = vec.capacity();
assert!(pg.data.is_null());
pg.data = ptr;
pg.len = length as i32;
pg.maxlen = capacity as i32;
std::mem::forget(vec);
ptr
}
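/// Self-contained sketch (not part of this file) of the Vec <-> raw-parts
/// round trip that `take_vec_u8`/`store_vec_u8` rely on: as long as pointer,
/// length, and capacity are carried across unchanged, ownership can cross
/// the C boundary and come back without copying or leaking.
#[cfg(test)]
mod vec_raw_parts_sketch {
    #[test]
    fn round_trip() {
        let v = vec![1_u8, 2, 3];
        let (ptr, len, cap) = (v.as_ptr() as *mut u8, v.len(), v.capacity());
        std::mem::forget(v); // hand ownership to the "C side"
        // ... C code stores (ptr, len, cap) opaquely, e.g. in StringInfoData ...
        let back = unsafe { Vec::from_raw_parts(ptr, len, cap) };
        assert_eq!(back, [1, 2, 3]); // ownership (and the allocation) returned
    }
}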

View File

@@ -1,14 +0,0 @@
pub mod bindings {
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
// bindgen creates some unsafe code with no doc comments.
#![allow(clippy::missing_safety_doc)]
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
#![allow(clippy::useless_transmute)]
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
}
pub mod api_bindings;
pub mod walproposer;

View File

@@ -1,485 +0,0 @@
use std::ffi::CString;
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::id::TenantTimelineId;
use crate::{
api_bindings::{create_api, take_vec_u8, Level},
bindings::{
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
WalProposerStart,
},
};
/// Rust high-level wrapper for C walproposer API. Many methods are not required
/// for simple cases, hence todo!() in default implementations.
///
/// Refer to `pgxn/neon/walproposer.h` for documentation.
pub trait ApiImpl {
fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
todo!()
}
fn start_streaming(&self, _startpos: u64) {
todo!()
}
fn get_flush_rec_ptr(&self) -> u64 {
todo!()
}
fn get_current_timestamp(&self) -> i64 {
todo!()
}
fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
todo!()
}
fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
todo!()
}
fn conn_connect_start(&self, _sk: &mut Safekeeper) {
todo!()
}
fn conn_connect_poll(
&self,
_sk: &mut Safekeeper,
) -> crate::bindings::WalProposerConnectPollStatusType {
todo!()
}
fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
todo!()
}
fn conn_get_query_result(
&self,
_sk: &mut Safekeeper,
) -> crate::bindings::WalProposerExecStatusType {
todo!()
}
fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
todo!()
}
fn conn_finish(&self, _sk: &mut Safekeeper) {
todo!()
}
fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
todo!()
}
fn conn_async_write(
&self,
_sk: &mut Safekeeper,
_buf: &[u8],
) -> crate::bindings::PGAsyncWriteResult {
todo!()
}
fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
todo!()
}
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
todo!()
}
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
todo!()
}
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
todo!()
}
fn free_event_set(&self, _wp: &mut WalProposer) {
todo!()
}
fn init_event_set(&self, _wp: &mut WalProposer) {
todo!()
}
fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
todo!()
}
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
todo!()
}
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
todo!()
}
fn strong_random(&self, _buf: &mut [u8]) -> bool {
todo!()
}
fn get_redo_start_lsn(&self) -> u64 {
todo!()
}
fn finish_sync_safekeepers(&self, _lsn: u64) {
todo!()
}
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
todo!()
}
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
todo!()
}
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
todo!()
}
fn after_election(&self, _wp: &mut WalProposer) {
todo!()
}
}
pub enum WaitResult {
Latch,
Timeout,
Network(*mut Safekeeper, u32),
}
pub struct Config {
/// Tenant and timeline id
pub ttid: TenantTimelineId,
/// List of safekeepers in format `host:port`
pub safekeepers_list: Vec<String>,
/// Safekeeper reconnect timeout in milliseconds
pub safekeeper_reconnect_timeout: i32,
/// Safekeeper connection timeout in milliseconds
pub safekeeper_connection_timeout: i32,
/// Walproposer mode: either finish once all safekeepers are synced, or
/// subscribe to WAL streaming
pub sync_safekeepers: bool,
}
/// WalProposer main struct. C methods are reexported as Rust functions.
pub struct Wrapper {
wp: *mut WalProposer,
_safekeepers_list_vec: Vec<u8>,
}
impl Wrapper {
pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
.unwrap()
.into_raw();
let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
.unwrap()
.into_raw();
let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
.unwrap()
.into_bytes_with_nul();
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
let c_config = WalProposerConfig {
neon_tenant,
neon_timeline,
safekeepers_list,
safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
safekeeper_connection_timeout: config.safekeeper_connection_timeout,
wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
syncSafekeepers: config.sync_safekeepers,
systemId: 0,
pgTimeline: 1,
callback_data,
};
let c_config = Box::into_raw(Box::new(c_config));
let api = create_api();
let wp = unsafe { WalProposerCreate(c_config, api) };
Wrapper {
wp,
_safekeepers_list_vec: safekeepers_list_vec,
}
}
pub fn start(&self) {
unsafe { WalProposerStart(self.wp) }
}
}
impl Drop for Wrapper {
fn drop(&mut self) {
unsafe {
let config = (*self.wp).config;
drop(Box::from_raw(
(*config).callback_data as *mut Box<dyn ApiImpl>,
));
drop(CString::from_raw((*config).neon_tenant));
drop(CString::from_raw((*config).neon_timeline));
drop(Box::from_raw(config));
for i in 0..(*self.wp).n_safekeepers {
let sk = &mut (*self.wp).safekeeper[i as usize];
take_vec_u8(&mut sk.inbuf);
}
WalProposerFree(self.wp);
}
}
}
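// Sketch (illustrative only) of the callback_data ownership scheme used by
// `Wrapper::new` and `Drop` above: the trait object is double-boxed so a
// *thin* pointer can round-trip through C's void*, and each into_raw is
// paired with exactly one from_raw.
#[cfg(test)]
mod callback_data_sketch {
    trait Api {
        fn ping(&self) -> u32;
    }
    struct MockApi;
    impl Api for MockApi {
        fn ping(&self) -> u32 {
            42
        }
    }

    #[test]
    fn round_trip() {
        let api: Box<dyn Api> = Box::new(MockApi);
        // Box<Box<dyn Api>> -> thin *mut c_void (a bare Box<dyn Api> would be
        // a fat pointer and wouldn't fit in a void*).
        let raw = Box::into_raw(Box::new(api)) as *mut std::os::raw::c_void;
        // Callbacks borrow through the pointer without taking ownership:
        let got = unsafe { (*(raw as *mut Box<dyn Api>)).ping() };
        assert_eq!(got, 42);
        // Drop reclaims it exactly once:
        drop(unsafe { Box::from_raw(raw as *mut Box<dyn Api>) });
    }
}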
#[cfg(test)]
mod tests {
use std::{
cell::Cell,
sync::{atomic::AtomicUsize, mpsc::sync_channel},
};
use utils::id::TenantTimelineId;
use crate::{api_bindings::Level, walproposer::Wrapper};
use super::ApiImpl;
#[derive(Clone, Copy, Debug)]
struct WaitEventsData {
sk: *mut crate::bindings::Safekeeper,
event_mask: u32,
}
struct MockImpl {
// data to return from wait_event_set
wait_events: Cell<WaitEventsData>,
// walproposer->safekeeper messages
expected_messages: Vec<Vec<u8>>,
expected_ptr: AtomicUsize,
// safekeeper->walproposer messages
safekeeper_replies: Vec<Vec<u8>>,
replies_ptr: AtomicUsize,
// channel to send LSN to the main thread
sync_channel: std::sync::mpsc::SyncSender<u64>,
}
impl MockImpl {
fn check_walproposer_msg(&self, msg: &[u8]) {
let ptr = self
.expected_ptr
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
if ptr >= self.expected_messages.len() {
panic!("unexpected message from walproposer");
}
let expected_msg = &self.expected_messages[ptr];
assert_eq!(msg, expected_msg.as_slice());
}
fn next_safekeeper_reply(&self) -> &[u8] {
let ptr = self
.replies_ptr
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
if ptr >= self.safekeeper_replies.len() {
panic!("no more safekeeper replies");
}
&self.safekeeper_replies[ptr]
}
}
impl ApiImpl for MockImpl {
fn get_current_timestamp(&self) -> i64 {
println!("get_current_timestamp");
0
}
fn conn_status(
&self,
_: &mut crate::bindings::Safekeeper,
) -> crate::bindings::WalProposerConnStatusType {
println!("conn_status");
crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
}
fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
println!("conn_connect_start");
}
fn conn_connect_poll(
&self,
_: &mut crate::bindings::Safekeeper,
) -> crate::bindings::WalProposerConnectPollStatusType {
println!("conn_connect_poll");
crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
}
fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
println!("conn_send_query: {}", query);
true
}
fn conn_get_query_result(
&self,
_: &mut crate::bindings::Safekeeper,
) -> crate::bindings::WalProposerExecStatusType {
println!("conn_get_query_result");
crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
}
fn conn_async_read(
&self,
_: &mut crate::bindings::Safekeeper,
) -> (&[u8], crate::bindings::PGAsyncReadResult) {
println!("conn_async_read");
let reply = self.next_safekeeper_reply();
println!("conn_async_read result: {:?}", reply);
(
reply,
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
)
}
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
println!("conn_blocking_write: {:?}", buf);
self.check_walproposer_msg(buf);
true
}
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
println!("wal_reader_allocate")
}
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
println!("free_event_set")
}
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
println!("init_event_set")
}
fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
println!(
"update_event_set, sk={:?}, events_mask={:#b}",
sk as *mut crate::bindings::Safekeeper, event_mask
);
self.wait_events.set(WaitEventsData { sk, event_mask });
}
fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
println!(
"add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
sk as *mut crate::bindings::Safekeeper, event_mask
);
self.wait_events.set(WaitEventsData { sk, event_mask });
}
fn wait_event_set(
&self,
_: &mut crate::bindings::WalProposer,
timeout_millis: i64,
) -> super::WaitResult {
let data = self.wait_events.get();
println!(
"wait_event_set, timeout_millis={}, res={:?}",
timeout_millis, data
);
super::WaitResult::Network(data.sk, data.event_mask)
}
fn strong_random(&self, buf: &mut [u8]) -> bool {
println!("strong_random");
buf.fill(0);
true
}
fn finish_sync_safekeepers(&self, lsn: u64) {
self.sync_channel.send(lsn).unwrap();
panic!("sync safekeepers finished at lsn={}", lsn);
}
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
println!("walprop_log[{}] {}", level, msg);
}
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
println!("after_election");
}
}
/// Test that the walproposer can successfully connect to a safekeeper and finish
/// sync_safekeepers. The API is mocked in MockImpl.
///
/// Run this test with valgrind to detect leaks:
/// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
#[test]
fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
let ttid = TenantTimelineId::new(
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
);
let (sender, receiver) = sync_channel(1);
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
wait_events: Cell::new(WaitEventsData {
sk: std::ptr::null_mut(),
event_mask: 0,
}),
expected_messages: vec![
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
],
// VoteRequest(VoteRequest { term: 3 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
],
],
expected_ptr: AtomicUsize::new(0),
safekeeper_replies: vec![
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
],
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
vec![
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
],
],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
});
let config = crate::walproposer::Config {
ttid,
safekeepers_list: vec!["localhost:5000".to_string()],
safekeeper_reconnect_timeout: 1000,
safekeeper_connection_timeout: 10000,
sync_safekeepers: true,
};
let wp = Wrapper::new(my_impl, config);
// walproposer will panic when it finishes sync_safekeepers
std::panic::catch_unwind(|| wp.start()).unwrap_err();
// validate the resulting LSN: the mocked VoteResponse carries flush_lsn 0/539, and 0x539 == 1337 decimal
assert_eq!(receiver.recv()?, 1337);
Ok(())
// drop() will free up resources here
}
}

View File

@@ -32,15 +32,9 @@ fn redo_scenarios(c: &mut Criterion) {
let manager = Arc::new(manager);
{
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
tracing::info!("executing first");
short().execute(rt.handle(), &manager).unwrap();
tracing::info!("first executed");
}
tracing::info!("executing first");
short().execute(&manager).unwrap();
tracing::info!("first executed");
let thread_counts = [1, 2, 4, 8, 16];
@@ -83,14 +77,9 @@ fn add_multithreaded_walredo_requesters(
assert_ne!(threads, 0);
if threads == 1 {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
b.iter_batched_ref(
|| Some(input_factory()),
|input| execute_all(input.take(), handle, manager),
|input| execute_all(input.take(), manager),
criterion::BatchSize::PerIteration,
);
} else {
@@ -106,26 +95,19 @@ fn add_multithreaded_walredo_requesters(
let manager = manager.clone();
let barrier = barrier.clone();
let work_rx = work_rx.clone();
move || {
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
loop {
// queue up and wait if we want to go another round
if work_rx.lock().unwrap().recv().is_err() {
break;
}
let input = Some(input_factory());
barrier.wait();
execute_all(input, handle, &manager).unwrap();
barrier.wait();
move || loop {
// queue up and wait if we want to go another round
if work_rx.lock().unwrap().recv().is_err() {
break;
}
let input = Some(input_factory());
barrier.wait();
execute_all(input, &manager).unwrap();
barrier.wait();
}
})
})
@@ -167,17 +149,13 @@ impl Drop for JoinOnDrop {
}
}
fn execute_all<I>(
input: I,
handle: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<()>
fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> anyhow::Result<()>
where
I: IntoIterator<Item = Request>,
{
// just fire all requests as fast as possible
input.into_iter().try_for_each(|req| {
let page = req.execute(handle, manager)?;
let page = req.execute(manager)?;
assert_eq!(page.remaining(), 8192);
anyhow::Ok(())
})
@@ -492,11 +470,9 @@ struct Request {
}
impl Request {
fn execute(
self,
rt: &tokio::runtime::Handle,
manager: &PostgresRedoManager,
) -> anyhow::Result<Bytes> {
fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
use pageserver::walredo::WalRedoManager;
let Request {
key,
lsn,
@@ -505,6 +481,6 @@ impl Request {
pg_version,
} = self;
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
manager.request_redo(key, lsn, base_img, records, pg_version)
}
}
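// --- Illustrative sketch, not part of the diff above ---
// The benchmark changes hinge on whether request_redo must be entered through a
// Tokio runtime. The handle-based side of this diff amounts to the following
// bridging pattern (a sketch, not the benchmark itself):
fn run_blocking<F: std::future::Future>(fut: F) -> F::Output {
    // A cheap current-thread runtime lets a synchronous driver (e.g. a
    // criterion benchmark thread) block on a single async request.
    tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .expect("failed to build runtime")
        .block_on(fut)
}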

View File

@@ -13,7 +13,6 @@
use anyhow::{anyhow, bail, ensure, Context};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use postgres_ffi::pg_constants;
use std::fmt::Write as FmtWrite;
use std::time::SystemTime;
use tokio::io;
@@ -181,7 +180,6 @@ where
}
}
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
@@ -215,34 +213,6 @@ where
self.add_rel(rel, rel).await?;
}
}
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
if path.starts_with("pg_replslot") {
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
let restart_lsn = Lsn(u64::from_le_bytes(
content[offs..offs + 8].try_into().unwrap(),
));
info!("Replication slot {} restart LSN={}", path, restart_lsn);
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
}
let header = new_tar_header(&path, content.len() as u64)?;
self.ar
.append(&header, &*content)
.await
.context("could not add aux file to basebackup tarball")?;
}
}
if min_restart_lsn != Lsn::MAX {
info!(
"Min restart LSN for logical replication is {}",
min_restart_lsn
);
let data = min_restart_lsn.0.to_le_bytes();
let header = new_tar_header("restart.lsn", data.len() as u64)?;
self.ar
.append(&header, &data[..])
.await
.context("could not add restart.lsn file to basebackup tarball")?;
}
for xid in self
.timeline

View File

@@ -2,7 +2,6 @@
use std::env::{var, VarError};
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, str::FromStr};
use anyhow::{anyhow, Context};
@@ -201,51 +200,6 @@ fn initialize_config(
})
}
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
timeout_remaining: Duration,
skipped: Option<F>,
}
/// During startup, we apply a timeout to our waits for readiness, to avoid
/// stalling the whole service if one Tenant experiences some problem. Each
/// phase may consume some of the timeout: this function returns the updated
/// timeout for use in the next call.
async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
where
F: std::future::Future + Unpin,
{
let initial_t = Instant::now();
let skipped = match tokio::time::timeout(timeout, &mut fut).await {
Ok(_) => None,
Err(_) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
%phase,
"Startup phase timed out, proceeding anyway"
);
Some(fut)
}
};
WaitForPhaseResult {
timeout_remaining: timeout
.checked_sub(Instant::now().duration_since(initial_t))
.unwrap_or(Duration::ZERO),
skipped,
}
}
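// --- Illustrative sketch, not part of the diff above ---
// Assuming the wait_for_phase/WaitForPhaseResult definitions above are in scope,
// the shrinking timeout budget threads through consecutive phases like this
// (phase names hypothetical):
async fn drive_two_phases(budget: std::time::Duration) {
    let phase_a = std::pin::pin!(async { /* e.g. remote part of initial load */ });
    let WaitForPhaseResult {
        timeout_remaining: budget,
        skipped: skipped_a,
    } = wait_for_phase("phase_a", phase_a, budget).await;
    let phase_b = std::pin::pin!(async { /* e.g. initial logical sizes */ });
    let WaitForPhaseResult {
        timeout_remaining: _,
        skipped: skipped_b,
    } = wait_for_phase("phase_b", phase_b, budget).await;
    // Run any skipped phases to completion so their checkpoints still fire.
    if let Some(f) = skipped_a {
        f.await;
    }
    if let Some(f) = skipped_b {
        f.await;
    }
}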
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
let elapsed = started_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
}
fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
@@ -253,6 +207,16 @@ fn start_pageserver(
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();
let startup_checkpoint = move |phase: &str, human_phase: &str| {
let elapsed = started_startup_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
};
// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
@@ -377,7 +341,7 @@ fn start_pageserver(
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
startup_checkpoint("initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);
// Startup staging or optimizing:
@@ -424,93 +388,58 @@ fn start_pageserver(
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
let guard = scopeguard::guard_on_success((), |_| {
tracing::info!("Cancelled before initial load completed")
});
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
let timeout = conf.background_task_maximum_delay;
init_remote_done_rx.wait().await;
startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
let init_remote_done = std::pin::pin!(async {
init_remote_done_rx.wait().await;
startup_checkpoint(
started_startup_at,
"initial_tenant_load_remote",
"Remote part of initial load completed",
);
});
let WaitForPhaseResult {
timeout_remaining: timeout,
skipped: init_remote_skipped,
} = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
let init_load_done = std::pin::pin!(async {
init_done_rx.wait().await;
startup_checkpoint(
started_startup_at,
"initial_tenant_load",
"Initial load completed",
);
STARTUP_IS_LOADING.set(0);
});
let WaitForPhaseResult {
timeout_remaining: timeout,
skipped: init_load_skipped,
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load", "Initial load completed");
STARTUP_IS_LOADING.set(0);
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let guard = scopeguard::guard_on_success((), |_| {
tracing::info!("Cancelled before initial logical sizes completed")
});
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let logical_sizes_done = std::pin::pin!(async {
init_logical_size_done_rx.wait().await;
startup_checkpoint(
started_startup_at,
"initial_logical_sizes",
"Initial logical sizes completed",
);
});
let timeout = conf.background_task_maximum_delay;
let WaitForPhaseResult {
timeout_remaining: _,
skipped: logical_sizes_skipped,
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
None
}
Err(_) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
"Initial logical size timeout elapsed; starting background jobs"
);
Some(init_sizes_done)
}
};
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start: we either completed prior stages, or they reached timeout
// and were skipped. It is important that we do not let them block background jobs indefinitely,
// because things like consumption metrics for billing are blocked by this barrier.
// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint(
started_startup_at,
"background_jobs_can_start",
"Starting background jobs",
);
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
// We are done. If we skipped any phases due to timeout, run them to completion here so that
// they will eventually update their startup_checkpoint, and so that we do not declare the
// 'complete' stage until all the other stages are really done.
let guard = scopeguard::guard_on_success((), |_| {
tracing::info!("Cancelled before waiting for skipped phases done")
});
if let Some(f) = init_remote_skipped {
f.await;
}
if let Some(f) = init_load_skipped {
f.await;
}
if let Some(f) = logical_sizes_skipped {
f.await;
}
scopeguard::ScopeGuard::into_inner(guard);
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest, logical sizes will be queried by
// consumption metrics.
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
init_sizes_done.await;
startup_checkpoint(started_startup_at, "complete", "Startup complete");
scopeguard::ScopeGuard::into_inner(guard);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
}
startup_checkpoint("complete", "Startup complete");
};
async move {
@@ -650,7 +579,6 @@ fn start_pageserver(
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
task_mgr::shutdown_token(),
)
.await
},

View File

@@ -1298,6 +1298,10 @@ pub(crate) mod mock {
}
}
pub fn get_executed(&self) -> usize {
self.executed.load(Ordering::Relaxed)
}
#[allow(clippy::await_holding_lock)]
pub async fn pump(&self) {
if let Some(remote_storage) = &self.remote_storage {

View File

@@ -306,67 +306,6 @@ paths:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
get:
description: Get timestamp for a given LSN
parameters:
- name: lsn
in: query
required: true
schema:
type: integer
description: An LSN for which to get the timestamp
responses:
"200":
description: OK
content:
application/json:
schema:
type: string
format: date-time
"400":
description: Error when no tenant id found in path, no timeline id or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found, or there is no timestamp information for the given lsn
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
parameters:

View File

@@ -2,12 +2,10 @@
//! Management HTTP API
//!
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
use humantime::format_rfc3339;
use hyper::header::CONTENT_TYPE;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
@@ -504,33 +502,6 @@ async fn get_lsn_by_timestamp_handler(
json_response(StatusCode::OK, result)
}
async fn get_timestamp_of_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let lsn_str = must_get_query_param(&request, "lsn")?;
let lsn = Lsn::from_str(&lsn_str)
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
.map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
Some(time) => {
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
json_response(StatusCode::OK, time)
}
None => json_response(StatusCode::NOT_FOUND, ()),
}
}
async fn tenant_attach_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -1063,17 +1034,9 @@ async fn put_tenant_location_config_handler(
// The `Detached` state is special, it doesn't upsert a tenant, it removes
// its local disk content and drops it from memory.
if let LocationConfigMode::Detached = request_data.config.mode {
if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach", %tenant_id))
.await
{
match e {
TenantStateError::NotFound(_) => {
// This API is idempotent: a NotFound on a detach is fine.
}
_ => return Err(e.into()),
}
}
.await?;
return json_response(StatusCode::OK, ());
}
@@ -1709,10 +1672,6 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|r| api_handler(r, get_lsn_by_timestamp_handler),
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
api_handler(r, timeline_gc_handler)
})

View File

@@ -318,6 +318,15 @@ impl std::ops::Deref for PageWriteGuard<'_> {
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
match &mut self.state {
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
PageWriteGuardState::Downgraded => unreachable!(),
}
}
}
impl<'a> PageWriteGuard<'a> {
/// Mark that the buffer contents are now valid.
#[must_use]

View File

@@ -122,7 +122,6 @@ pub async fn libpq_listener_main(
listener: TcpListener,
auth_type: AuthType,
listener_ctx: RequestContext,
cancel: CancellationToken,
) -> anyhow::Result<()> {
listener.set_nonblocking(true)?;
let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
@@ -131,7 +130,7 @@ pub async fn libpq_listener_main(
while let Some(res) = tokio::select! {
biased;
_ = cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
None
}
@@ -300,7 +299,7 @@ impl PageServerHandler {
Ok(flush_r?)
},
_ = self.cancel.cancelled() => {
Err(QueryError::Shutdown)
Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
}
)
}
@@ -317,11 +316,11 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
Err(QueryError::Shutdown)
Err(QueryError::Other(anyhow::anyhow!(msg)))
}
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
@@ -415,10 +414,10 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
break;
}
msg = pgb.read_message() => { msg }

View File

@@ -19,7 +19,6 @@ use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
@@ -371,6 +370,7 @@ impl Timeline {
}
}
///
/// Subroutine of find_lsn_for_timestamp(). Returns true if there are any
/// commits that committed after 'search_timestamp' at LSN 'probe_lsn'.
///
@@ -385,50 +385,6 @@ impl Timeline {
found_larger: &mut bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
if timestamp >= search_timestamp {
*found_larger = true;
return ControlFlow::Break(true);
} else {
*found_smaller = true;
}
ControlFlow::Continue(())
})
.await
}
/// Obtain the possible timestamp range for the given lsn.
///
/// If the lsn has no timestamps, returns `None`; otherwise returns the largest timestamp found at that lsn.
pub async fn get_timestamp_for_lsn(
&self,
probe_lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<TimestampTz>, PageReconstructError> {
let mut max: Option<TimestampTz> = None;
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
if let Some(max_prev) = max {
max = Some(max_prev.max(timestamp));
} else {
max = Some(timestamp);
}
ControlFlow::Continue(())
})
.await?;
Ok(max)
}
/// Runs the given function on all the timestamps for a given lsn
///
/// The return value is either given by the closure, or set to the `Default`
/// impl's output.
async fn map_all_timestamps<T: Default>(
&self,
probe_lsn: Lsn,
ctx: &RequestContext,
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
.await?
@@ -446,14 +402,16 @@ impl Timeline {
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
match f(timestamp) {
ControlFlow::Break(b) => return Ok(b),
ControlFlow::Continue(()) => (),
if timestamp >= search_timestamp {
*found_larger = true;
return Ok(true);
} else {
*found_smaller = true;
}
}
}
}
Ok(Default::default())
Ok(false)
}
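// --- Illustrative sketch, not part of the diff above ---
// The removed map_all_timestamps helper relies on std::ops::ControlFlow to let
// the closure short-circuit. A self-contained sketch of that visitation pattern:
fn visit_all<T: Default>(
    items: &[i64],
    mut f: impl FnMut(i64) -> std::ops::ControlFlow<T>,
) -> T {
    use std::ops::ControlFlow;
    for &item in items {
        match f(item) {
            ControlFlow::Break(b) => return b,
            ControlFlow::Continue(()) => (),
        }
    }
    // No closure broke out: fall back to the Default value, as documented above.
    T::default()
}
fn visit_all_demo() {
    use std::ops::ControlFlow;
    // Early-exit as soon as a timestamp at or past the search point is seen.
    let found = visit_all(&[10, 50, 120, 130], |ts| {
        if ts >= 100 {
            ControlFlow::Break(true)
        } else {
            ControlFlow::Continue(())
        }
    });
    assert!(found);
}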
/// Get a list of SLRU segments
@@ -541,23 +499,6 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
pub async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get(AUX_FILES_KEY, lsn, ctx).await {
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.files),
Err(e) => Err(PageReconstructError::from(e)),
},
Err(e) => {
warn!("Failed to get info about AUX files: {}", e);
Ok(HashMap::new())
}
}
}
/// Does the same as get_current_logical_size but counted on demand.
/// Used to initialize the logical size tracking on startup.
///
@@ -675,7 +616,6 @@ impl Timeline {
result.add_key(CONTROLFILE_KEY);
result.add_key(CHECKPOINT_KEY);
result.add_key(AUX_FILES_KEY);
Ok(result.to_keyspace())
}
@@ -752,12 +692,6 @@ impl<'a> DatadirModification<'a> {
})?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
})?;
@@ -862,12 +796,6 @@ impl<'a> DatadirModification<'a> {
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
}
if r.is_none() {
// Create RelDirectory
@@ -1192,36 +1120,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
Ok(buf) => AuxFilesDirectory::des(&buf)?,
Err(e) => {
warn!("Failed to get info about AUX files: {}", e);
AuxFilesDirectory {
files: HashMap::new(),
}
}
};
let path = path.to_string();
if content.is_empty() {
dir.files.remove(&path);
} else {
dir.files.insert(path, Bytes::copy_from_slice(content));
}
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
Ok(())
}
///
/// Flush changes accumulated so far to the underlying repository.
///
@@ -1357,11 +1255,6 @@ struct RelDirectory {
rels: HashSet<(Oid, u8)>,
}
#[derive(Debug, Serialize, Deserialize, Default)]
struct AuxFilesDirectory {
files: HashMap<String, Bytes>,
}
#[derive(Debug, Serialize, Deserialize)]
struct RelSizeEntry {
nblocks: u32,
@@ -1410,12 +1303,10 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
// 02 pg_twophase
//
// 03 misc
// Controlfile
// controlfile
// checkpoint
// pg_version
//
// 04 aux files
//
// Below is a full list of the keyspace allocation:
//
// DbDir:
@@ -1453,11 +1344,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
//
// Checkpoint:
// 03 00000000 00000000 00000000 00 00000001
//
// AuxFiles:
// 03 00000000 00000000 00000000 00 00000002
//
//-- Section 01: relation data and metadata
const DBDIR_KEY: Key = Key {
@@ -1681,15 +1567,6 @@ const CHECKPOINT_KEY: Key = Key {
field6: 1,
};
const AUX_FILES_KEY: Key = Key {
field1: 0x03,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 2,
};
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.

View File

@@ -77,12 +77,12 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::InitializationOrder;
use crate::METADATA_FILE_NAME;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
use crate::walredo::WalRedoManager;
use crate::TEMP_FILE_SUFFIX;
pub use pageserver_api::models::TenantState;
@@ -229,7 +229,7 @@ pub struct Tenant {
// with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
// timeout...
gc_cs: tokio::sync::Mutex<()>,
walredo_mgr: Arc<WalRedoManager>,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
@@ -246,43 +246,67 @@ pub struct Tenant {
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
}
pub(crate) enum WalRedoManager {
Prod(PostgresRedoManager),
#[cfg(test)]
Test(harness::TestRedoManager),
}
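// --- Illustrative sketch, not part of the diff above ---
// One side of this diff represents the wal-redo manager as a closed enum rather
// than a trait object. A minimal sketch of that dispatch pattern (names
// hypothetical); a likely motivation is that an enum keeps request_redo
// statically dispatched and lets it be an async fn without a dyn-compatible trait:
struct ProdManager;
struct TestManager;
enum SketchRedoManager {
    Prod(ProdManager),
    Test(TestManager),
}
impl SketchRedoManager {
    async fn request_redo(&self) -> &'static str {
        match self {
            SketchRedoManager::Prod(_) => "prod redo",
            SketchRedoManager::Test(_) => "test redo",
        }
    }
}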
// We should not blindly overwrite local metadata with the remote one.
// For example, consider the following case:
// an image layer is flushed to disk as a new delta layer, we update local metadata and start an upload task, but after that
// the pageserver crashes. During startup we'll load the new metadata, and then reset it
// to the state of the remote one. But the current layer map will have layers from the old
// metadata, which is inconsistent.
// And with the current logic it won't discard them during load, because during layer map
// load it sees the local disk consistent lsn, which is ahead of the layer lsns.
// If we treat the remote as the source of truth, we need to completely sync with it,
// i.e. delete local files which are missing on the remote. This adds extra work:
// the WAL for these layers needs to be reingested, for example.
//
// So the solution is to take the remote metadata only when we're attaching.
pub fn merge_local_remote_metadata<'a>(
local: Option<&'a TimelineMetadata>,
remote: Option<&'a TimelineMetadata>,
) -> anyhow::Result<(&'a TimelineMetadata, bool)> {
match (local, remote) {
(None, None) => anyhow::bail!("we should have either local metadata or remote"),
(Some(local), None) => Ok((local, true)),
// happens if we crash during attach, before writing out the metadata file
(None, Some(remote)) => Ok((remote, false)),
// This is the regular case where we crash/exit before finishing queued uploads.
// Also, it happens if we crash during attach after writing the metadata file
// but before removing the attaching marker file.
(Some(local), Some(remote)) => {
let consistent_lsn_cmp = local
.disk_consistent_lsn()
.cmp(&remote.disk_consistent_lsn());
let gc_cutoff_lsn_cmp = local
.latest_gc_cutoff_lsn()
.cmp(&remote.latest_gc_cutoff_lsn());
use std::cmp::Ordering::*;
match (consistent_lsn_cmp, gc_cutoff_lsn_cmp) {
// It wouldn't matter, but pick the local one so that we don't rewrite the metadata file.
(Equal, Equal) => Ok((local, true)),
// Local state is clearly ahead of the remote.
(Greater, Greater) => Ok((local, true)),
// We have local layer files that aren't on the remote, but GC horizon is on par.
(Greater, Equal) => Ok((local, true)),
// Local GC started running but we couldn't sync it to the remote.
(Equal, Greater) => Ok((local, true)),
impl From<PostgresRedoManager> for WalRedoManager {
fn from(mgr: PostgresRedoManager) -> Self {
Self::Prod(mgr)
}
}
#[cfg(test)]
impl From<harness::TestRedoManager> for WalRedoManager {
fn from(mgr: harness::TestRedoManager) -> Self {
Self::Test(mgr)
}
}
impl WalRedoManager {
pub async fn request_redo(
&self,
key: crate::repository::Key,
lsn: Lsn,
base_img: Option<(Lsn, bytes::Bytes)>,
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
pg_version: u32,
) -> anyhow::Result<bytes::Bytes> {
match self {
Self::Prod(mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
}
#[cfg(test)]
Self::Test(mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
// We always update the local value first, so something else must have
// updated the remote value, probably a different pageserver.
// The control plane is supposed to prevent this from happening.
// Bail out.
(Less, Less)
| (Less, Equal)
| (Equal, Less)
| (Less, Greater)
| (Greater, Less) => {
anyhow::bail!(
r#"remote metadata appears to be ahead of local metadata:
local:
{local:#?}
remote:
{remote:#?}
"#
);
}
}
}
}
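// --- Illustrative sketch, not part of the diff above ---
// Reduced to the pair of LSN comparisons, merge_local_remote_metadata's decision
// table can be condensed as follows (pick_local is hypothetical; None stands in
// for the bail-out case where the remote appears ahead of the local metadata):
fn pick_local(
    consistent_lsn_cmp: std::cmp::Ordering,
    gc_cutoff_lsn_cmp: std::cmp::Ordering,
) -> Option<bool> {
    use std::cmp::Ordering::*;
    match (consistent_lsn_cmp, gc_cutoff_lsn_cmp) {
        (Equal, Equal)       // on par: pick local to avoid rewriting the metadata file
        | (Greater, Greater) // local clearly ahead
        | (Greater, Equal)   // local has extra layers, GC horizon on par
        | (Equal, Greater)   // local GC ran but was not synced to the remote
            => Some(true),
        _ => None,
    }
}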
@@ -351,6 +375,11 @@ impl Debug for SetStoppingError {
}
}
struct RemoteStartupData {
index_part: IndexPart,
remote_metadata: TimelineMetadata,
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum WaitToBecomeActiveError {
WillNotBecomeActive {
@@ -391,12 +420,6 @@ pub enum CreateTimelineError {
Other(#[from] anyhow::Error),
}
/// spawn_attach argument for whether the caller is using attachment markers
pub(super) enum AttachMarkerMode {
Expect,
Ignore,
}
struct TenantDirectoryScan {
sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
@@ -423,17 +446,24 @@ impl Tenant {
&self,
timeline_id: TimelineId,
resources: TimelineResources,
index_part: Option<IndexPart>,
metadata: TimelineMetadata,
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
init_order: Option<&InitializationOrder>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let tenant_id = self.tenant_id;
let (up_to_date_metadata, picked_local) = merge_local_remote_metadata(
local_metadata.as_ref(),
remote_startup_data.as_ref().map(|r| &r.remote_metadata),
)
.context("merge_local_remote_metadata")?
.to_owned();
let timeline = self.create_timeline_struct(
timeline_id,
&metadata,
up_to_date_metadata,
ancestor.clone(),
resources,
init_order,
@@ -446,11 +476,20 @@ impl Tenant {
);
assert_eq!(
disk_consistent_lsn,
metadata.disk_consistent_lsn(),
up_to_date_metadata.disk_consistent_lsn(),
"these are used interchangeably"
);
if let Some(index_part) = index_part.as_ref() {
// Save the metadata file to local disk.
if !picked_local {
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
.await
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
if let Some(index_part) = index_part {
timeline
.remote_client
.as_ref()
@@ -463,12 +502,15 @@ impl Tenant {
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(&metadata)?;
rtc.schedule_index_upload_for_metadata_update(&metadata)?;
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
}
timeline
.load_layer_map(disk_consistent_lsn, index_part)
.load_layer_map(
disk_consistent_lsn,
remote_startup_data.map(|x| x.index_part),
)
.await
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -525,13 +567,10 @@ impl Tenant {
resources: TenantSharedResources,
attached_conf: AttachedTenantConf,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
expect_marker: AttachMarkerMode,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO dedup with spawn_load
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf, tenant_id,
)));
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let TenantSharedResources {
broker_client,
@@ -609,7 +648,7 @@ impl Tenant {
}
}
match tenant_clone.attach(&ctx, expect_marker).await {
match tenant_clone.attach(&ctx).await {
Ok(()) => {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
@@ -634,23 +673,17 @@ impl Tenant {
///
/// No background tasks are started as part of this routine.
///
async fn attach(
self: &Arc<Tenant>,
ctx: &RequestContext,
expect_marker: AttachMarkerMode,
) -> anyhow::Result<()> {
async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
if let AttachMarkerMode::Expect = expect_marker {
if !tokio::fs::try_exists(&marker_file)
.await
.context("check for existence of marker file")?
{
anyhow::bail!(
"implementation error: marker file should exist at beginning of this function"
);
}
if !tokio::fs::try_exists(&marker_file)
.await
.context("check for existence of marker file")?
{
anyhow::bail!(
"implementation error: marker file should exist at beginning of this function"
);
}
// Get list of remote timelines
@@ -772,12 +805,10 @@ impl Tenant {
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
}
if let AttachMarkerMode::Expect = expect_marker {
std::fs::remove_file(&marker_file)
.with_context(|| format!("unlink attach marker file {marker_file}"))?;
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
.context("fsync tenant directory after unlinking attach marker file")?;
}
std::fs::remove_file(&marker_file)
.with_context(|| format!("unlink attach marker file {marker_file}"))?;
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
.context("fsync tenant directory after unlinking attach marker file")?;
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
@@ -830,23 +861,21 @@ impl Tenant {
None
};
// we can load remote timelines during init, but they are assumed to be so rare that
// the initialization order is not passed here.
let init_order = None;
// timeline loading after attach expects to find metadata file for each metadata
save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata)
.await
.context("save_metadata")
.map_err(LoadLocalTimelineError::Load)?;
// Even if there is local metadata, it cannot be ahead of the remote one,
// since we're attaching. Even if we resume an interrupted attach, the remote one
// cannot be older than the local one.
let local_metadata = None;
self.timeline_init_and_sync(
timeline_id,
resources,
Some(index_part),
remote_metadata,
Some(RemoteStartupData {
index_part,
remote_metadata,
}),
local_metadata,
ancestor,
init_order,
None,
ctx,
)
.await
@@ -858,9 +887,7 @@ impl Tenant {
tenant_id: TenantId,
reason: String,
) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf, tenant_id,
)));
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
Arc::new(Tenant::new(
TenantState::Broken {
reason,
@@ -899,9 +926,7 @@ impl Tenant {
let broker_client = resources.broker_client;
let remote_storage = resources.remote_storage;
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf, tenant_id,
)));
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Tenant::new(
TenantState::Loading,
conf,
@@ -1324,8 +1349,8 @@ impl Tenant {
LoadLocalTimelineError::Load(source) => {
// We tried to load deleted timeline, this is a bug.
return Err(anyhow::anyhow!(source).context(
format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}")
));
"This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
));
}
LoadLocalTimelineError::ResumeDeletion(source) => {
// Make sure resumed deletion wont fail loading for entire tenant.
@@ -1359,11 +1384,6 @@ impl Tenant {
let mut resources = self.build_timeline_resources(timeline_id);
struct RemoteStartupData {
index_part: IndexPart,
remote_metadata: TimelineMetadata,
}
let (remote_startup_data, remote_client) = match preload {
Some(preload) => {
let TimelinePreload {
@@ -1419,7 +1439,7 @@ impl Tenant {
)
}
Err(DownloadError::NotFound) => {
info!(found_delete_mark, "no index file was found on the remote, resuming deletion or cleaning unuploaded up");
info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
if found_delete_mark {
// We could've resumed at a point where the remote index was deleted, but the metadata file wasn't.
@@ -1433,73 +1453,14 @@ impl Tenant {
.map_err(LoadLocalTimelineError::ResumeDeletion);
}
// As the remote index_part.json did not exist, this timeline is a
// not-yet-uploaded one. It should be deleted now, because the branching might
// not have been valid, as its ancestor may have been restored to an earlier
// state as well. In practice, the control plane will keep retrying.
//
// First ensure that the un-uploaded timeline looks like it should, as in we
// are not accidentally deleting a timeline which was ever active:
// - root timelines have metadata and one possibly partial layer
// - branched timelines have metadata
//
// If the timeline does not look like expected, fail loading of the tenant.
// Cleaning the timeline up manually and reloading the tenant is possible via
// the above log message.
let path = self.conf.timeline_path(&self.tenant_id, &timeline_id);
let span = tracing::Span::current();
return tokio::task::spawn_blocking({
move || {
use std::str::FromStr;
use crate::tenant::storage_layer::LayerFileName;
let _e = span.entered();
let mut metadata = false;
let mut layers = 0;
let mut others = 0;
for dentry in path.read_dir_utf8()? {
let dentry = dentry?;
let file_name = dentry.file_name();
if file_name == METADATA_FILE_NAME {
metadata = true;
continue;
}
if LayerFileName::from_str(file_name).is_ok()
{
layers += 1;
continue;
}
others += 1;
}
// bootstrapped timelines have one image layer file or one partial temp
// file; branched timelines have just the metadata
if !(metadata && layers + others <= 1) {
anyhow::bail!("unexpected assumed unuploaded, never been active timeline: found metadata={}, layers={}, others={}", metadata, layers, others);
}
let tmp_path =
path.with_file_name(format!("{timeline_id}{}", TEMP_FILE_SUFFIX));
std::fs::rename(path, &tmp_path)?;
std::fs::remove_dir_all(&tmp_path)?;
Ok(())
}
})
.await
.map_err(anyhow::Error::new)
.and_then(|x| x)
.context("delete assumed unuploaded fresh timeline")
.map_err(LoadLocalTimelineError::Load);
// We're loading a fresh timeline that didn't yet make it into the remote.
(None, Some(remote_client))
}
Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
}
}
None => {
// No remote client
if found_delete_mark {
// There is no remote client, we found local metadata.
// Continue cleaning up local disk.
@@ -1531,27 +1492,11 @@ impl Tenant {
None
};
let (index_part, metadata) = match remote_startup_data {
Some(RemoteStartupData {
index_part,
remote_metadata,
}) => {
// always choose the remote metadata to be crash consistent (see RFC 27)
save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata)
.await
.context("save_metadata")
.map_err(LoadLocalTimelineError::Load)?;
(Some(index_part), remote_metadata)
}
None => (None, local_metadata),
};
self.timeline_init_and_sync(
timeline_id,
resources,
index_part,
metadata,
remote_startup_data,
Some(local_metadata),
ancestor,
init_order,
ctx,
@@ -2466,7 +2411,7 @@ impl Tenant {
state: TenantState,
conf: &'static PageServerConf,
attached_conf: AttachedTenantConf,
walredo_mgr: Arc<WalRedoManager>,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
@@ -3597,7 +3542,7 @@ pub async fn dump_layerfile_from_path(
}
#[cfg(test)]
pub(crate) mod harness {
pub mod harness {
use bytes::{Bytes, BytesMut};
use once_cell::sync::OnceCell;
use std::fs;
@@ -3608,6 +3553,7 @@ pub(crate) mod harness {
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
walredo::WalRedoManager,
};
use super::*;
@@ -3736,7 +3682,7 @@ pub(crate) mod harness {
}
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
TenantState::Loading,
@@ -3770,10 +3716,10 @@ pub(crate) mod harness {
}
// Mock WAL redo manager that doesn't do much
pub(crate) struct TestRedoManager;
pub struct TestRedoManager;
impl TestRedoManager {
pub async fn request_redo(
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
key: Key,
lsn: Lsn,

View File

@@ -458,10 +458,7 @@ impl DeleteTenantFlow {
.await
.expect("cant be stopping or broken");
tenant
.attach(ctx, super::AttachMarkerMode::Expect)
.await
.context("attach")?;
tenant.attach(ctx).await.context("attach")?;
Self::background(
guard,

View File

@@ -354,7 +354,8 @@ mod tests {
}
// Test a large blob that spans multiple pages
let mut large_data = vec![0; 20000];
let mut large_data = Vec::new();
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;

View File

@@ -27,8 +27,7 @@ use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{
create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
TenantState,
create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -152,49 +151,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
/// Create a directory, including parents. This does no fsyncs and makes
/// no guarantees about the persistence of the resulting metadata: for
/// use when creating directories that serve only as caches.
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
let mut dirs_to_create = Vec::new();
let mut path: &Utf8Path = path.as_ref();
// Figure out which directories we need to create.
loop {
let meta = tokio::fs::metadata(path).await;
match meta {
Ok(metadata) if metadata.is_dir() => break,
Ok(_) => {
return Err(std::io::Error::new(
std::io::ErrorKind::AlreadyExists,
format!("non-directory found in path: {path}"),
));
}
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(e),
}
dirs_to_create.push(path);
match path.parent() {
Some(parent) => path = parent,
None => {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
format!("can't find parent of path '{path}'"),
));
}
}
}
// Create directories from parent to child.
for &path in dirs_to_create.iter().rev() {
tokio::fs::create_dir(path).await?;
}
Ok(())
}
fn emergency_generations(
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantId, Generation> {
@@ -490,15 +446,7 @@ pub(crate) fn schedule_local_tenant_processing(
"attaching mark file present but no remote storage configured".to_string(),
)
} else {
match Tenant::spawn_attach(
conf,
tenant_id,
resources,
location_conf,
tenants,
AttachMarkerMode::Expect,
ctx,
) {
match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -707,7 +655,7 @@ pub(crate) async fn set_new_tenant_config(
Ok(())
}
#[instrument(skip_all, fields(%tenant_id))]
#[instrument(skip_all, fields(tenant_id, new_location_config))]
pub(crate) async fn upsert_location(
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -786,61 +734,36 @@ pub(crate) async fn upsert_location(
}
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
let tenant_path = conf.tenant_path(&tenant_id);
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
unsafe_create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Attached(_attach_config) => {
// Do a schedule_local_tenant_processing
// FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
// we have the same problem in load_tenant/attach_tenant. Probably
// need a lock in TenantSlot to fix this.
let timelines_path = conf.timelines_path(&tenant_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
unsafe_create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let tenant = match Tenant::spawn_attach(
let tenant_path = conf.tenant_path(&tenant_id);
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
};
let new_tenant = schedule_local_tenant_processing(
conf,
tenant_id,
TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
},
&tenant_path,
AttachedTenantConf::try_from(new_location_config)?,
resources,
None,
&TENANTS,
// The LocationConf API does not use marker files, because we have Secondary
// locations where the directory's existence is not a signal that it contains
// all timelines. See https://github.com/neondatabase/neon/issues/5550
AttachMarkerMode::Ignore,
ctx,
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
};
)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
TenantSlot::Attached(tenant)
TenantSlot::Attached(new_tenant)
}
};
@@ -848,6 +771,7 @@ pub(crate) async fn upsert_location(
})
.await?;
}
Ok(())
}

View File

@@ -226,14 +226,6 @@ impl LayerFileName {
_ => false,
}
}
pub(crate) fn kind(&self) -> &'static str {
use LayerFileName::*;
match self {
Delta(_) => "delta",
Image(_) => "image",
}
}
}
impl fmt::Display for LayerFileName {

View File

@@ -25,7 +25,7 @@ use super::{
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
/// [`DeltaLayer`].
/// [`DeltaLayer`](super::DeltaLayer).
///
/// RemoteLayer might be downloaded on-demand during operations which are
/// allowed to download remote layers, and during which it gets replaced with a

View File

@@ -81,6 +81,7 @@ use crate::repository::GcResult;
use crate::repository::{Key, Value};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::walredo::WalRedoManager;
use crate::ZERO_PAGE;
use self::delete::DeleteTimelineFlow;
@@ -200,7 +201,7 @@ pub struct Timeline {
last_freeze_ts: RwLock<Instant>,
// WAL redo manager
walredo_mgr: Arc<super::WalRedoManager>,
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
/// Remote storage client.
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1470,7 +1471,7 @@ impl Timeline {
timeline_id: TimelineId,
tenant_id: TenantId,
generation: Generation,
walredo_mgr: Arc<super::WalRedoManager>,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
resources: TimelineResources,
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
@@ -1699,7 +1700,7 @@ impl Timeline {
disk_consistent_lsn: Lsn,
index_part: Option<IndexPart>,
) -> anyhow::Result<()> {
use init::{Decision::*, Discovered, DismissedLayer};
use init::{Decision::*, Discovered, FutureLayer};
use LayerFileName::*;
let mut guard = self.layers.write().await;
@@ -1715,7 +1716,7 @@ impl Timeline {
// Copy to move into the task we're about to spawn
let generation = self.generation;
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
move || {
let _g = span.entered();
let discovered = init::scan_timeline_dir(&timeline_path)?;
@@ -1764,6 +1765,7 @@ impl Timeline {
);
let mut loaded_layers = Vec::new();
let mut needs_upload = Vec::new();
let mut needs_cleanup = Vec::new();
let mut total_physical_size = 0;
@@ -1784,7 +1786,7 @@ impl Timeline {
}
}
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
Err(FutureLayer { local }) => {
if local.is_some() {
path.push(name.file_name());
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
@@ -1793,13 +1795,6 @@ impl Timeline {
needs_cleanup.push(name);
continue;
}
Err(DismissedLayer::LocalOnly(local)) => {
path.push(name.file_name());
init::cleanup_local_only_file(&path, &name, &local)?;
path.pop();
// this file never existed remotely, we will have to do rework
continue;
}
};
match &name {
@@ -1808,16 +1803,14 @@ impl Timeline {
}
let status = match &decision {
UseLocal(_) => LayerResidenceStatus::Resident,
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
};
tracing::debug!(layer=%name, ?decision, ?status, "applied");
let stats = LayerAccessStats::for_loading_layer(status);
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
(Delta(d), UseLocal(m)) => {
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(DeltaLayer::new(
conf,
@@ -1828,7 +1821,7 @@ impl Timeline {
stats,
))
}
(Image(i), UseLocal(m)) => {
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(ImageLayer::new(
conf,
@@ -1847,9 +1840,17 @@ impl Timeline {
),
};
if let NeedsUpload(m) = decision {
needs_upload.push((layer.clone(), m));
}
loaded_layers.push(layer);
}
Ok((loaded_layers, needs_cleanup, total_physical_size))
Ok((
loaded_layers,
(needs_upload, needs_cleanup),
total_physical_size,
))
}
})
.await
@@ -1861,6 +1862,10 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
let (needs_upload, needs_cleanup) = to_sync;
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
@@ -4323,7 +4328,6 @@ impl Timeline {
let img = match self
.walredo_mgr
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await
.context("Failed to reconstruct a page image:")
{
Ok(img) => img,

View File

@@ -72,7 +72,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
}
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone, Debug)]
#[derive(Clone)]
pub(super) enum Decision {
/// The layer is not present locally.
Evicted(LayerFileMetadata),
@@ -84,30 +84,27 @@ pub(super) enum Decision {
},
/// The layer is present locally, and metadata matches.
UseLocal(LayerFileMetadata),
/// The layer is only known locally, it needs to be uploaded.
NeedsUpload(LayerFileMetadata),
}
/// A layer needs to be left out of the layer map.
/// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
#[derive(Debug)]
pub(super) enum DismissedLayer {
/// The related layer is in the future compared to disk_consistent_lsn; it must not be loaded.
Future {
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
local: Option<LayerFileMetadata>,
},
/// The layer only exists locally.
///
/// In order to make crash-safe updates to the layer map, we must dismiss layers which
/// are only found locally or not yet included in the remote `index_part.json`.
LocalOnly(LayerFileMetadata),
pub(super) struct FutureLayer {
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
pub(super) local: Option<LayerFileMetadata>,
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
///
/// This function should not gain reasons to fail other than [`FutureLayer`]; consider adding
/// the checks earlier to [`scan_timeline_dir`].
pub(super) fn reconcile(
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
use Decision::*;
// name => (local, remote)
@@ -145,19 +142,17 @@ pub(super) fn reconcile(
.into_iter()
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
Err(FutureLayer { local })
} else {
match (local, remote) {
(Some(local), Some(remote)) if local != remote => {
Ok(UseRemote { local, remote })
}
(Some(x), Some(_)) => Ok(UseLocal(x)),
(None, Some(x)) => Ok(Evicted(x)),
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
Ok(match (local, remote) {
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
(Some(x), Some(_)) => UseLocal(x),
(None, Some(x)) => Evicted(x),
(Some(x), None) => NeedsUpload(x),
(None, None) => {
unreachable!("there must not be any non-local non-remote files")
}
}
})
};
(name, decision)
@@ -197,21 +192,14 @@ pub(super) fn cleanup_future_layer(
name: &LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {
use LayerFileName::*;
let kind = match name {
Delta(_) => "delta",
Image(_) => "image",
};
// future image layers may always be produced for lsns that are not yet flushed
// to disk and are still stored in InMemoryLayer.
let kind = name.kind();
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
std::fs::remove_file(path)?;
Ok(())
}
pub(super) fn cleanup_local_only_file(
path: &Utf8Path,
name: &LayerFileName,
local: &LayerFileMetadata,
) -> anyhow::Result<()> {
let kind = name.kind();
tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
std::fs::remove_file(path)?;
crate::tenant::timeline::rename_to_backup(path)?;
Ok(())
}

View File

@@ -338,20 +338,11 @@ impl<'a> WalIngest<'a> {
} else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
let xlrec = XlLogicalMessage::decode(&mut buf);
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
if prefix == "neon-test" {
// This is a convenient way to make the WAL ingestion pause at
// a particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!(
"wal-ingest-logical-message-sleep"
);
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
modification.put_file(path, message, ctx).await?;
}
// This is a convenient way to make the WAL ingestion pause at
// a particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
}
}
@@ -468,6 +459,7 @@ impl<'a> WalIngest<'a> {
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = v14::XlHeapDelete::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
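The assert_eq!(0, buf.remaining()) lines added in these hunks encode the invariant that decoding consumed the whole record body. A stand-alone sketch of the same invariant with a plain byte-slice cursor instead of the bytes crate (illustrative only):

fn decode_u32(buf: &mut &[u8]) -> u32 {
    // Consume four little-endian bytes from the front of the cursor.
    let (head, tail) = buf.split_at(4);
    *buf = tail;
    u32::from_le_bytes(head.try_into().unwrap())
}

fn main() {
    let mut buf: &[u8] = &[1, 0, 0, 0];
    let flags = decode_u32(&mut buf);
    assert_eq!(0, buf.len()); // same idea as assert_eq!(0, buf.remaining())
    assert_eq!(flags, 1);
}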
@@ -535,6 +527,7 @@ impl<'a> WalIngest<'a> {
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = v15::XlHeapDelete::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
@@ -602,6 +595,7 @@ impl<'a> WalIngest<'a> {
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = v16::XlHeapDelete::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
@@ -777,6 +771,7 @@ impl<'a> WalIngest<'a> {
}
pg_constants::XLOG_NEON_HEAP_DELETE => {
let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}

View File

@@ -748,26 +748,6 @@ impl XlMultiXactTruncate {
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlLogicalMessage {
pub db_id: Oid,
pub transactional: bool,
pub prefix_size: usize,
pub message_size: usize,
}
impl XlLogicalMessage {
pub fn decode(buf: &mut Bytes) -> XlLogicalMessage {
XlLogicalMessage {
db_id: buf.get_u32_le(),
transactional: buf.get_u32_le() != 0, // 4-bytes alignment
prefix_size: buf.get_u64_le() as usize,
message_size: buf.get_u64_le() as usize,
}
}
}
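The removed struct describes a fixed little-endian layout: a 24-byte header (db_id, a flag padded to 4 bytes, prefix_size, message_size) followed by a NUL-terminated prefix and the message body, which is how the WAL ingest code above slices out prefix and message. A dependency-free sketch of a decoder for that layout (assuming exactly the layout shown above; the real code uses the bytes crate):

fn decode_logical_message(buf: &[u8]) -> Option<(&str, &[u8])> {
    let prefix_size = u64::from_le_bytes(buf.get(8..16)?.try_into().ok()?) as usize;
    let message_size = u64::from_le_bytes(buf.get(16..24)?.try_into().ok()?) as usize;
    let body = buf.get(24..)?;
    // The prefix is NUL-terminated; drop the trailing NUL byte.
    let prefix = std::str::from_utf8(body.get(..prefix_size.checked_sub(1)?)?).ok()?;
    let message = body.get(prefix_size..prefix_size + message_size)?;
    Some((prefix, message))
}

fn main() {
    let mut wire = vec![0u8; 24];
    wire[8..16].copy_from_slice(&10u64.to_le_bytes()); // prefix_size, incl. NUL
    wire[16..24].copy_from_slice(&2u64.to_le_bytes()); // message_size
    wire.extend_from_slice(b"neon-file\0ok");
    assert_eq!(decode_logical_message(&wire), Some(("neon-file", &b"ok"[..])));
}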
/// Main routine to decode a WAL record and figure out which blocks are modified
//
// See xlogrecord.h for details

View File

@@ -26,12 +26,13 @@ use serde::Serialize;
use std::collections::VecDeque;
use std::io;
use std::io::prelude::*;
use std::io::{Error, ErrorKind};
use std::ops::{Deref, DerefMut};
use std::os::unix::io::{AsRawFd, RawFd};
use std::os::unix::prelude::CommandExt;
use std::process::Stdio;
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::sync::{Mutex, MutexGuard};
use std::time::Duration;
use std::time::Instant;
use tracing::*;
@@ -70,7 +71,29 @@ pub(crate) struct BufferTag {
pub blknum: u32,
}
///
/// WAL Redo Manager is responsible for replaying WAL records.
///
/// Callers use the WAL redo manager through this abstract interface,
/// which makes it easy to mock it in tests.
pub trait WalRedoManager: Send + Sync {
/// Apply some WAL records.
///
/// The caller passes an old page image, and WAL records that should be
/// applied over it. The return value is a new page image, after applying
/// the records.
fn request_redo(
&self,
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
) -> anyhow::Result<Bytes>;
}
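As the doc comment says, the point of the trait is mockability in tests. A minimal sketch of that pattern with simplified stand-in signatures (the real trait uses Key, Lsn, Bytes, and anyhow::Result):

use std::sync::Mutex;

trait RedoManager: Send + Sync {
    fn request_redo(&self, key: u64, records: &[Vec<u8>]) -> Result<Vec<u8>, String>;
}

// Test double: records which keys were requested, returns a canned page.
struct MockRedo {
    calls: Mutex<Vec<u64>>,
}

impl RedoManager for MockRedo {
    fn request_redo(&self, key: u64, _records: &[Vec<u8>]) -> Result<Vec<u8>, String> {
        self.calls.lock().unwrap().push(key);
        Ok(vec![0u8; 8192]) // a zeroed 8k page image
    }
}

fn main() {
    let redo = MockRedo { calls: Mutex::new(Vec::new()) };
    assert_eq!(redo.request_redo(42, &[]).unwrap().len(), 8192);
    assert_eq!(*redo.calls.lock().unwrap(), vec![42]);
}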
struct ProcessInput {
child: NoLeakChild,
stdin: ChildStdin,
stderr_fd: RawFd,
stdout_fd: RawFd,
@@ -93,7 +116,13 @@ struct ProcessOutput {
pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
/// Counter to separate same-sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
stdout: Mutex<Option<ProcessOutput>>,
stdin: Mutex<Option<ProcessInput>>,
stderr: Mutex<Option<ChildStderr>>,
}
/// Can this request be served by neon redo functions
@@ -114,14 +143,14 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
///
/// Public interface of WAL redo manager
///
impl PostgresRedoManager {
impl WalRedoManager for PostgresRedoManager {
///
/// Request the WAL redo manager to apply some WAL records
///
/// The WAL redo is handled by a separate thread, so this just sends a request
/// to the thread and waits for the response.
///
pub async fn request_redo(
fn request_redo(
&self,
key: Key,
lsn: Lsn,
@@ -186,7 +215,11 @@ impl PostgresRedoManager {
PostgresRedoManager {
tenant_id,
conf,
redo_process: RwLock::new(None),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
stdin: Mutex::new(None),
stdout: Mutex::new(None),
stderr: Mutex::new(None),
}
}
@@ -209,38 +242,20 @@ impl PostgresRedoManager {
let start_time = Instant::now();
let mut n_attempts = 0u32;
loop {
let mut proc = self.stdin.lock().unwrap();
let lock_time = Instant::now();
// launch the WAL redo process on first use
let proc: Arc<WalRedoProcess> = {
let proc_guard = self.redo_process.read().unwrap();
match &*proc_guard {
None => {
// "upgrade" to write lock to launch the process
drop(proc_guard);
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
let proc = Arc::new(
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
.context("launch walredo process")?,
);
*proc_guard = Some(Arc::clone(&proc));
proc
}
Some(proc) => Arc::clone(proc),
}
}
Some(proc) => Arc::clone(proc),
}
};
if proc.is_none() {
self.launch(&mut proc, pg_version)
.context("launch process")?;
}
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
let result = proc
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
let result = self
.apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
.context("apply_wal_records");
let end_time = Instant::now();
@@ -281,34 +296,22 @@ impl PostgresRedoManager {
n_attempts,
e,
);
// Avoid concurrent callers hitting the same issue.
// We can't prevent it from happening because we want to enable parallelism.
let mut guard = self.redo_process.write().unwrap();
match &*guard {
Some(current_field_value) => {
if Arc::ptr_eq(current_field_value, &proc) {
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
*guard = None;
}
}
None => {
// Another thread was faster to observe the error, and already took the process out of rotation.
}
// self.stdin only holds stdin & stderr as_raw_fd().
// Dropping it as part of take() doesn't close them.
// The owning objects (ChildStdout and ChildStderr) are stored in
// self.stdout and self.stderr, respectively.
// We intentionally keep them open here to avoid a race between
// currently running `apply_wal_records()` and a `launch()` call
// after we return here.
// The currently running `apply_wal_records()` must not read from
// the newly launched process.
// By keeping self.stdout and self.stderr open here, `launch()` will
// get other file descriptors for the new child's stdout and stderr,
// and hence the current `apply_wal_records()` calls will observe
// `output.stdout.as_raw_fd() != stdout_fd` .
if let Some(proc) = self.stdin.lock().unwrap().take() {
proc.child.kill_and_wait();
}
drop(guard);
// NB: there may still be other concurrent threads using `proc`.
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
// holding the lock while waiting for the process to exit.
// NB: the drop impl blocks the current thread with a wait() system call for
// the child process. We dropped the `guard` above so that other threads aren't
// affected. But, it's good that the current thread _does_ block to wait.
// If we instead deferred the waiting into the background / to tokio, it could
// happen that if walredo always fails immediately, we spawn processes faster
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
drop(proc);
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");
}
@@ -611,32 +614,24 @@ impl<C: CommandExt> CloseFileDescriptors for C {
}
}
struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_id: TenantId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
stderr: Mutex<ChildStderr>,
/// Counter to separate same-sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
impl WalRedoProcess {
impl PostgresRedoManager {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
#[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))]
fn launch(
conf: &'static PageServerConf,
tenant_id: TenantId,
&self,
input: &mut MutexGuard<Option<ProcessInput>>,
pg_version: u32,
) -> anyhow::Result<Self> {
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
) -> Result<(), Error> {
let pg_bin_dir_path = self
.conf
.pg_bin_dir(pg_version)
.map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?;
let pg_lib_dir_path = self
.conf
.pg_lib_dir(pg_version)
.map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
@@ -657,8 +652,13 @@ impl WalRedoProcess {
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
.spawn_no_leak_child(tenant_id)
.context("spawn process")?;
.spawn_no_leak_child(self.tenant_id)
.map_err(|e| {
Error::new(
e.kind(),
format!("postgres --wal-redo command failed to start: {}", e),
)
})?;
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
@@ -685,47 +685,36 @@ impl WalRedoProcess {
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
Ok(Self {
conf,
tenant_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdout_fd: stdout.as_raw_fd(),
stderr_fd: stderr.as_raw_fd(),
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
stderr: Mutex::new(stderr),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
**input = Some(ProcessInput {
child,
stdout_fd: stdout.as_raw_fd(),
stderr_fd: stderr.as_raw_fd(),
stdin,
n_requests: 0,
});
fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
*self.stdout.lock().unwrap() = Some(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
});
*self.stderr.lock().unwrap() = Some(stderr);
Ok(())
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
fn apply_wal_records(
&self,
input: MutexGuard<Option<ProcessInput>>,
tag: BufferTag,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let input = self.stdin.lock().unwrap();
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
@@ -768,17 +757,18 @@ impl WalRedoProcess {
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
mut input: MutexGuard<Option<ProcessInput>>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but it keeps the patch small.
let proc = input.as_mut().unwrap();
let mut nwrite = 0usize;
let stdout_fd = proc.stdout_fd;
// Prepare for calling poll()
let mut pollfds = [
PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT),
PollFd::new(proc.stderr_fd, PollFlags::POLLIN),
PollFd::new(proc.stdout_fd, PollFlags::POLLIN),
PollFd::new(stdout_fd, PollFlags::POLLIN),
];
// We do two things simultaneously: send the old base image and WAL records to
@@ -800,7 +790,8 @@ impl WalRedoProcess {
let err_revents = pollfds[1].revents().unwrap();
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
let mut errbuf: [u8; 16384] = [0; 16384];
let mut stderr = self.stderr.lock().unwrap();
let mut stderr_guard = self.stderr.lock().unwrap();
let stderr = stderr_guard.as_mut().unwrap();
let len = stderr.read(&mut errbuf)?;
// The message might not be split correctly into lines here. But this is
@@ -830,7 +821,7 @@ impl WalRedoProcess {
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
drop(input);
// To improve walredo performance we separate sending requests and receiving
// responses. They are protected by different mutexes (output and input).
@@ -844,7 +835,20 @@ impl WalRedoProcess {
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut output = self.stdout.lock().unwrap();
let mut output_guard = self.stdout.lock().unwrap();
let output = output_guard.as_mut().unwrap();
if output.stdout.as_raw_fd() != stdout_fd {
// If the stdout file descriptor has changed, it means the walredo process crashed and was restarted.
// Since ProcessInput and ProcessOutput are protected by different mutexes,
// it can happen that we send a request to one process and wait for the response from another.
// To prevent such a situation we compare stdout file descriptors.
// Since the old stdout pipe is destroyed only after the new one is created,
// the new one cannot reuse the same file descriptor, so this check is safe.
//
// Cross-read this with the comment in apply_batch_postgres if result.is_err().
// That's where we kill the child process.
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
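The bookkeeping shown above — requests numbered under the input lock, responses collected in a ring buffer under the output lock — can be sketched in isolation as follows (stand-in types, not the real ProcessOutput; reading from the child is elided):

use std::collections::VecDeque;

struct Output {
    pending: VecDeque<Option<Vec<u8>>>, // None = slot already claimed
    n_processed: usize,                 // responses already handed out
}

impl Output {
    // Called with the next response read from the child's stdout.
    fn push_response(&mut self, page: Vec<u8>) {
        self.pending.push_back(Some(page));
    }

    // Claim the response for `request_no`, then trim claimed slots from
    // the front so n_processed advances monotonically.
    fn take(&mut self, request_no: usize) -> Option<Vec<u8>> {
        let idx = request_no.checked_sub(self.n_processed)?;
        let page = self.pending.get_mut(idx)?.take();
        while matches!(self.pending.front(), Some(None)) {
            self.pending.pop_front();
            self.n_processed += 1;
        }
        page
    }
}

fn main() {
    let mut out = Output { pending: VecDeque::new(), n_processed: 0 };
    out.push_response(vec![1u8; 8192]); // response for request 0
    out.push_response(vec![2u8; 8192]); // response for request 1
    assert!(out.take(1).is_some()); // claimed out of order
    assert!(out.take(0).is_some());
    assert_eq!(out.n_processed, 2);
}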
@@ -869,7 +873,8 @@ impl WalRedoProcess {
let err_revents = pollfds[1].revents().unwrap();
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
let mut errbuf: [u8; 16384] = [0; 16384];
let mut stderr = self.stderr.lock().unwrap();
let mut stderr_guard = self.stderr.lock().unwrap();
let stderr = stderr_guard.as_mut().unwrap();
let len = stderr.read(&mut errbuf)?;
// The message might not be split correctly into lines here. But this is
@@ -979,15 +984,6 @@ impl WalRedoProcess {
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait();
}
}
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
@@ -1135,15 +1131,15 @@ fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
#[cfg(test)]
mod tests {
use super::PostgresRedoManager;
use super::{PostgresRedoManager, WalRedoManager};
use crate::repository::Key;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
use bytes::Bytes;
use std::str::FromStr;
use utils::{id::TenantId, lsn::Lsn};
#[tokio::test]
async fn short_v14_redo() {
#[test]
fn short_v14_redo() {
let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap();
let h = RedoHarness::new().unwrap();
@@ -1164,14 +1160,13 @@ mod tests {
short_records(),
14,
)
.await
.unwrap();
assert_eq!(&expected, &*page);
}
#[tokio::test]
async fn short_v14_fails_for_wrong_key_but_returns_zero_page() {
#[test]
fn short_v14_fails_for_wrong_key_but_returns_zero_page() {
let h = RedoHarness::new().unwrap();
let page = h
@@ -1191,7 +1186,6 @@ mod tests {
short_records(),
14,
)
.await
.unwrap();
// TODO: there will be some stderr printout, which is forwarded to tracing that could

View File

@@ -9,7 +9,6 @@ OBJS = \
libpagestore.o \
neon.o \
neon_utils.o \
neon_walreader.o \
pagestore_smgr.o \
relsize_cache.o \
walproposer.o \
@@ -24,34 +23,6 @@ EXTENSION = neon
DATA = neon--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \
libwalproposer.a
WALPROP_OBJS = \
$(WIN32RES) \
walproposer.o \
neon_utils.o \
walproposer_compat.o
.PHONY: walproposer-lib
walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
walproposer-lib: libwalproposer.a;
.PHONY: libwalproposer.a
libwalproposer.a: $(WALPROP_OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
# needs vars:
# FIND_TYPEDEF pointing to find_typedef
# INDENT pointing to pg_bsd_indent
# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
# pgindent will pick it up as pg_bsd_indent path).
.PHONY: pgindent
pgindent:
+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
$(FIND_TYPEDEF) . > neon.typedefs
INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)

View File

@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
static bool ForwardDDL = true;
/* Curl structures for sending the HTTP requests */
static CURL *CurlHandle;
static CURL * CurlHandle;
static struct curl_slist *ContentHeader = NULL;
/*
@@ -54,7 +54,7 @@ typedef enum
{
Op_Set, /* An upsert: Either a creation or an alter */
Op_Delete,
} OpType;
} OpType;
typedef struct
{
@@ -62,7 +62,7 @@ typedef struct
Oid owner;
char old_name[NAMEDATALEN];
OpType type;
} DbEntry;
} DbEntry;
typedef struct
{
@@ -70,7 +70,7 @@ typedef struct
char old_name[NAMEDATALEN];
const char *password;
OpType type;
} RoleEntry;
} RoleEntry;
/*
* We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
struct DdlHashTable *prev_table;
HTAB *db_table;
HTAB *role_table;
} DdlHashTable;
} DdlHashTable;
static DdlHashTable RootTable;
static DdlHashTable *CurrentDdlTable = &RootTable;
static DdlHashTable * CurrentDdlTable = &RootTable;
static void
PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
{
char str[ERROR_SIZE];
size_t size;
} ErrorString;
} ErrorString;
static size_t
ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)

View File

@@ -25,80 +25,79 @@
#include <curl/curl.h>
static int extension_server_port = 0;
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
/* to download all SQL (and data) files for an extension: */
/* curl -X POST http://localhost:8080/extension_server/postgis */
/* it covers two possible extension files layouts: */
/* 1. extension_name--version--platform.sql */
/* 2. extension_name/extension_name--version.sql */
/* extension_name/extra_files.csv */
/* */
/* to download specific library file: */
/* curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */
// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
// extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
/* Don't error here because postgres will try to find the file */
/* and will fail with some proper error message if it's not found. */
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
// Don't error here because postgres will try to find the file
// and will fail with some proper error message if it's not found.
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
}
/* always cleanup */
curl_easy_cleanup(curl);
}
return ret;
return ret;
}
void
pg_init_extension_server()
void pg_init_extension_server()
{
/* Port to connect to compute_ctl on localhost */
/* to request extension files. */
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
// Port to connect to compute_ctl on localhost
// to request extension files.
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
/* set download_extension_file_hook */
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
// set download_extension_file_hook
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -67,33 +67,31 @@ typedef struct FileCacheEntry
BufferTag key;
uint32 offset;
uint32 access_count;
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
dlist_node lru_node; /* LRU list node */
uint32 bitmap[BLOCKS_PER_CHUNK/32];
dlist_node lru_node; /* LRU list node */
} FileCacheEntry;
typedef struct FileCacheControl
{
uint64 generation; /* generation is needed to handle correct hash
* reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
uint64 generation; /* generation is needed to handle correct hash reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
dlist_head lru; /* double linked list for LRU replacement algorithm */
} FileCacheControl;
static HTAB *lfc_hash;
static int lfc_desc = 0;
static HTAB* lfc_hash;
static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static char *lfc_path;
static FileCacheControl *lfc_ctl;
static int lfc_max_size;
static int lfc_size_limit;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
void FileCacheMonitorMain(Datum main_arg);
void FileCacheMonitorMain(Datum main_arg);
/*
* Local file cache is optional and Neon can work without it.
@@ -102,10 +100,10 @@ void FileCacheMonitorMain(Datum main_arg);
* All cache content should be invalidated to avoid reading of stale or corrupted data
*/
static void
lfc_disable(char const *op)
lfc_disable(char const* op)
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
FileCacheEntry* entry;
elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
@@ -139,10 +137,9 @@ lfc_ensure_opened(void)
/* Open cache file if not done yet */
if (lfc_desc <= 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0)
{
if (lfc_desc < 0) {
lfc_disable("open");
return false;
}
@@ -153,7 +150,7 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
bool found;
bool found;
static HASHCTL info;
if (prev_shmem_startup_hook)
@@ -163,21 +160,16 @@ lfc_shmem_startup(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
if (!found)
{
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
info.entrysize = sizeof(FileCacheEntry);
lfc_hash = ShmemInitHash("lfc_hash",
/*
* lfc_size+1 because we add new element to hash table before eviction
* of victim
*/
lfc_size + 1, lfc_size + 1,
/* lfc_size+1 because we add new element to hash table before eviction of victim */
lfc_size+1, lfc_size+1,
&info,
HASH_ELEM | HASH_BLOBS);
lfc_ctl->generation = 0;
@@ -186,7 +178,7 @@ lfc_shmem_startup(void)
dlist_init(&lfc_ctl->lru);
/* Remove file cache on restart */
(void) unlink(lfc_path);
(void)unlink(lfc_path);
}
LWLockRelease(AddinShmemInitLock);
}
@@ -199,7 +191,7 @@ lfc_shmem_request(void)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
RequestNamedLWLockTranche("lfc_lock", 1);
}
@@ -217,14 +209,11 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
static void
lfc_change_limit_hook(int newval, void *extra)
{
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
/*
* The stats collector detaches shared memory, so we should not try to access
* shared memory here. Parallel workers first assign the default value (0), so
* do not perform truncation in parallel workers. The Postmaster can handle
* SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
* but has no PGPROC.
* The stats collector detaches shared memory, so we should not try to access shared memory here.
* Parallel workers first assign the default value (0), so do not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
return;
@@ -232,9 +221,8 @@ lfc_change_limit_hook(int newval, void *extra)
/* Open cache file if not done yet */
if (lfc_desc <= 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
if (lfc_desc < 0)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
lfc_size_limit = 0; /* disable file cache */
return;
@@ -243,15 +231,11 @@ lfc_change_limit_hook(int newval, void *extra)
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
{
/*
* Shrink cache by throwing away least recently accessed chunks and
* returning their space to file system
*/
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
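The shrink loop above relies on the invariant that only unpinned chunks (access_count == 0) sit on the LRU list, so the head is always safe to reclaim. An illustrative rendering of the same loop in Rust, with fallocate/punch-hole and the hash table elided (stand-in types):

use std::collections::VecDeque;

struct Chunk {
    offset: u32,
    pinned: bool,
}

fn shrink(lru: &mut VecDeque<Chunk>, used: &mut u32, new_size: u32) {
    while *used > new_size {
        match lru.pop_front() {
            Some(victim) => {
                assert!(!victim.pinned); // pinned chunks are never on the LRU list
                *used -= 1;
                let _ = victim.offset; // a hole would be punched at this offset
            }
            None => break, // everything left is pinned; stop shrinking
        }
    }
}

fn main() {
    let mut lru: VecDeque<Chunk> = (0..4).map(|o| Chunk { offset: o, pinned: false }).collect();
    let mut used = 4;
    shrink(&mut lru, &mut used, 2);
    assert_eq!(used, 2);
}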
@@ -275,7 +259,7 @@ lfc_init(void)
"Maximal size of Neon local file cache",
NULL,
&lfc_max_size,
0, /* disabled by default */
0, /* disabled by default */
0,
INT_MAX,
PGC_POSTMASTER,
@@ -288,7 +272,7 @@ lfc_init(void)
"Current limit for size of Neon local file cache",
NULL,
&lfc_size_limit,
0, /* disabled by default */
0, /* disabled by default */
0,
INT_MAX,
PGC_SIGHUP,
@@ -328,18 +312,18 @@ lfc_init(void)
bool
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
BufferTag tag;
FileCacheEntry *entry;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
bool found;
uint32 hash;
BufferTag tag;
FileCacheEntry* entry;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
bool found;
uint32 hash;
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return false;
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
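The addressing used throughout this file splits a block number into a chunk key (the chunk's first block) and an offset into the chunk's bitmap. A worked sketch, assuming BLOCKS_PER_CHUNK is a power of two (128 here is purely illustrative, not necessarily the extension's value):

const BLOCKS_PER_CHUNK: u32 = 128;

fn addressing(blkno: u32) -> (u32, usize, u32) {
    let chunk_key = blkno & !(BLOCKS_PER_CHUNK - 1); // tag.blockNum, the hash key
    let chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); // block within the chunk
    let word = (chunk_offs >> 5) as usize;           // which u32 word of the bitmap
    let bit = 1u32 << (chunk_offs & 31);             // which bit within that word
    (chunk_key, word, bit)
}

fn main() {
    // 1000 = 7 * 128 + 104, and 104 = 3 * 32 + 8
    assert_eq!(addressing(1000), (896, 3, 1 << 8));
}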
@@ -355,13 +339,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
void
lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
BufferTag tag;
FileCacheEntry *entry;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
uint32 hash;
BufferTag tag;
FileCacheEntry* entry;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return;
CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -389,10 +373,9 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
*/
if (entry->bitmap[chunk_offs >> 5] == 0)
{
bool has_remaining_pages;
bool has_remaining_pages;
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
{
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
if (entry->bitmap[i] != 0)
{
has_remaining_pages = true;
@@ -401,8 +384,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
}
/*
* Put the entry at the position that is first to be reclaimed when we
* have no cached pages remaining in the chunk
* Put the entry at the position that is first to be reclaimed when
* we have no cached pages remaining in the chunk
*/
if (!has_remaining_pages)
{
@@ -428,16 +411,16 @@ bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
char *buffer)
{
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
bool result = true;
uint32 hash;
uint64 generation;
uint32 entry_offset;
BufferTag tag;
FileCacheEntry* entry;
ssize_t rc;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
bool result = true;
uint32 hash;
uint64 generation;
uint32 entry_offset;
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return false;
if (!lfc_ensure_opened())
@@ -445,7 +428,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -464,7 +447,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
lfc_disable("read");
@@ -492,31 +475,31 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* If cache is full then evict some other page.
*/
void
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
char *buffer)
char *buffer)
#else
const void *buffer)
const void *buffer)
#endif
{
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
uint32 hash;
BufferTag tag;
FileCacheEntry* entry;
ssize_t rc;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
return;
if (!lfc_ensure_opened())
return;
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
CopyNRelFileInfoToBufTag(tag, rinfo);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -524,30 +507,24 @@ void
if (found)
{
/*
* Unlink entry from LRU list to pin it for the duration of IO
* operation
*/
/* Unlink entry from LRU list to pin it for the duration of IO operation */
if (entry->access_count++ == 0)
dlist_delete(&entry->lru_node);
}
else
{
/*
* We have two choices if all cache pages are pinned (i.e. used in IO
* operations): 1. Wait until some of those operations complete and
* pages are unpinned. 2. Allocate one more chunk, so that the specified
* cache size is a recommendation rather than a hard limit. As the
* probability of such an event (all pages pinned) is considered
* very small (there would have to be a very large number of
* concurrent IO operations, and they are limited by max_connections),
* We have two choices if all cache pages are pinned (i.e. used in IO operations):
* 1. Wait until some of those operations complete and pages are unpinned.
* 2. Allocate one more chunk, so that the specified cache size is a recommendation rather than a hard limit.
* As the probability of such an event (all pages pinned) is considered to be very small
* (there would have to be a very large number of concurrent IO operations, and they are limited by max_connections),
* we prefer not to complicate the code and use the second approach.
*/
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
@@ -556,14 +533,13 @@ void
else
{
lfc_ctl->used += 1;
entry->offset = lfc_ctl->size++; /* allocate new chunk at end
* of file */
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
}
entry->access_count = 1;
memset(entry->bitmap, 0, sizeof entry->bitmap);
}
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry->offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
LWLockRelease(lfc_lock);
@@ -625,9 +601,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
uint32 n_pages = 0;
HASH_SEQ_STATUS status;
FileCacheEntry* entry;
uint32 n_pages = 0;
funcctx = SRF_FIRSTCALL_INIT();
@@ -677,8 +653,8 @@ local_cache_pages(PG_FUNCTION_ARGS)
LWLockAcquire(lfc_lock, LW_SHARED);
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
@@ -704,14 +680,14 @@ local_cache_pages(PG_FUNCTION_ARGS)
* locks, so the information of each buffer is self-consistent.
*/
n_pages = 0;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
{
fctx->record[n_pages].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));

View File

@@ -60,7 +60,7 @@ int flush_every_n_requests = 8;
int n_reconnect_attempts = 0;
int max_reconnect_attempts = 60;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
static bool pageserver_flush(void);
@@ -80,10 +80,11 @@ pageserver_connect(int elevel)
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
* variable was set, use that as the password.
*
* The connection options are parsed in the order they're given, so when
* we set the password before the connection string, the connection string
* can override the password from the env variable. Seems useful, although
* we don't currently use that capability anywhere.
* The connection options are parsed in the order they're given, so
* when we set the password before the connection string, the
* connection string can override the password from the env variable.
* Seems useful, although we don't currently use that capability
* anywhere.
*/
n = 0;
if (neon_auth_token)
@@ -126,9 +127,9 @@ pageserver_connect(int elevel)
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
MyLatch, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
while (PQisBusy(pageserver_conn))
@@ -193,7 +194,6 @@ retry:
if (!PQconsumeInput(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_log(LOG, "could not get response from pageserver: %s", msg);
pfree(msg);
return -1;
@@ -234,7 +234,7 @@ pageserver_disconnect(void)
}
static bool
pageserver_send(NeonRequest *request)
pageserver_send(NeonRequest * request)
{
StringInfoData req_buff;
@@ -249,12 +249,10 @@ pageserver_send(NeonRequest *request)
/*
* If pageserver is stopped, the connections from compute node are broken.
* The compute node doesn't notice that immediately, but it will cause the
* next request to fail, usually on the next query. That causes
* user-visible errors if pageserver is restarted, or the tenant is moved
* from one pageserver to another. See
* https://github.com/neondatabase/neon/issues/1138. So try to reestablish
* connection in case of failure.
* The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
* That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
* See https://github.com/neondatabase/neon/issues/1138
* So try to reestablish connection in case of failure.
*/
if (!connected)
{
@@ -277,7 +275,6 @@ pageserver_send(NeonRequest *request)
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
@@ -335,8 +332,7 @@ pageserver_receive(void)
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
char* msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
@@ -370,7 +366,6 @@ pageserver_flush(void)
if (PQflush(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pfree(msg);
@@ -473,10 +468,7 @@ pg_init_libpagestore(void)
neon_log(PageStoreTrace, "libpagestore already loaded");
page_server = &api;
/*
* Retrieve the auth token to use when connecting to pageserver and
* safekeepers
*/
/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
neon_auth_token = getenv("NEON_AUTH_TOKEN");
if (neon_auth_token)
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

View File

@@ -1,96 +0,0 @@
/*
* Interface to set of libpq wrappers walproposer and neon_walreader need.
* Similar to libpqwalreceiver, but that one has blocking connection establishment and
* pqexec, which don't fit us. The implementation is in walproposer_pg.c.
*/
#ifndef ___LIBPQWALPROPOSER_H__
#define ___LIBPQWALPROPOSER_H__
/* Re-exported and modified ExecStatusType */
typedef enum
{
/* We received a single CopyBoth result */
WP_EXEC_SUCCESS_COPYBOTH,
/*
* Any success result other than a single CopyBoth was received. The
* specifics of the result were already logged, but it may be useful to
* provide an error message indicating which safekeeper messed up.
*
* Do not expect PQerrorMessage to be appropriately set.
*/
WP_EXEC_UNEXPECTED_SUCCESS,
/*
* No result available at this time. Wait until read-ready, then call
* again. Internally, this is returned when PQisBusy indicates that
* PQgetResult would block.
*/
WP_EXEC_NEEDS_INPUT,
/* Catch-all failure. Check PQerrorMessage. */
WP_EXEC_FAILED,
} WalProposerExecStatusType;
/* Possible return values from walprop_async_read */
typedef enum
{
/* The full read was successful. buf now points to the data */
PG_ASYNC_READ_SUCCESS,
/*
* The read is ongoing. Wait until the connection is read-ready, then try
* again.
*/
PG_ASYNC_READ_TRY_AGAIN,
/* Reading failed. Check PQerrorMessage(conn) */
PG_ASYNC_READ_FAIL,
} PGAsyncReadResult;
/* Possible return values from walprop_async_write */
typedef enum
{
/* The write fully completed */
PG_ASYNC_WRITE_SUCCESS,
/*
* The write started, but you'll need to call PQflush some more times to
* finish it off. We just tried, so it's best to wait until the connection
* is read- or write-ready to try again.
*
* If it becomes read-ready, call PQconsumeInput and flush again. If it
* becomes write-ready, just call PQflush.
*/
PG_ASYNC_WRITE_TRY_FLUSH,
/* Writing failed. Check PQerrorMessage(conn) */
PG_ASYNC_WRITE_FAIL,
} PGAsyncWriteResult;
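The PG_ASYNC_WRITE_TRY_FLUSH contract above implies a driving loop on the caller's side: keep flushing, and consume input when the socket turns readable. A schematic sketch of that loop (the closures stand in for PQflush and PQconsumeInput; this is not the extension's real API):

enum FlushResult {
    Done,
    TryFlush,
    Fail,
}

fn drive_flush(
    mut flush: impl FnMut() -> FlushResult,
    mut consume_input_if_readable: impl FnMut(),
) -> Result<(), ()> {
    loop {
        match flush() {
            FlushResult::Done => return Ok(()),
            // Wait for read- or write-readiness, consume input, flush again.
            FlushResult::TryFlush => consume_input_if_readable(),
            FlushResult::Fail => return Err(()),
        }
    }
}

fn main() {
    let mut pending = 2;
    let ok = drive_flush(
        || if pending == 0 { FlushResult::Done } else { pending -= 1; FlushResult::TryFlush },
        || {},
    );
    assert!(ok.is_ok());
}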
/*
* This header is included by walproposer.h to define walproposer_api; if we're
* building walproposer without pg, ignore libpq part, leaving only interface
* types.
*/
#ifndef WALPROPOSER_LIB
#include "libpq-fe.h"
/*
* Sometimes working directly with underlying PGconn is simpler, export the
* whole thing for simplicity.
*/
typedef struct WalProposerConn
{
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received CopyData message from
* walprop_async_read */
} WalProposerConn;
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
extern void libpqwp_disconnect(WalProposerConn *conn);
#endif /* WALPROPOSER_LIB */
#endif /* ___LIBPQWALPROPOSER_H__ */

View File

@@ -48,9 +48,9 @@ _PG_init(void)
pg_init_extension_server();
/* Important: This must happen after other parts of the extension */
/* are loaded, otherwise any settings to GUCs that were set before */
/* the extension was loaded will be removed. */
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.
EmitWarningsOnPlaceholders("neon");
}

View File

@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
* block_id; false otherwise.
*/
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
extern uint64 BackpressureThrottlingTime(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

View File

@@ -59,7 +59,7 @@
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
#else /* major version >= 16 */
#else /* major version >= 16 */
#define USE_RELFILELOCATOR
@@ -109,4 +109,4 @@
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
#endif
#endif /* //NEON_PGVERSIONCOMPAT_H */
#endif //NEON_PGVERSIONCOMPAT_H

View File

@@ -1,731 +0,0 @@
/*
* Like WALRead, but when WAL segment doesn't exist locally instead of throwing
* ERROR asynchronously tries to fetch it from the most advanced safekeeper.
*
* We can't use libpqwalreceiver as it blocks during connection establishment
* (and waiting for PQExec result), so use libpqwalproposer instead.
*
* TODO: keepalives are currently never sent, so the other side can close the
* connection prematurely.
*
* TODO: close conn if reading takes too long to prevent stuck connections.
*/
#include "postgres.h"
#include <sys/stat.h>
#include <unistd.h>
#include "access/xlog_internal.h"
#include "access/xlogdefs.h"
#include "access/xlogreader.h"
#include "libpq/pqformat.h"
#include "storage/fd.h"
#include "utils/wait_event.h"
#include "libpq-fe.h"
#include "neon_walreader.h"
#include "walproposer.h"
#define NEON_WALREADER_ERR_MSG_LEN 512
/*
* Can be called where NeonWALReader *state is available in the context, adds log_prefix.
*/
#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
static void NeonWALReaderResetRemote(NeonWALReader *state);
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
static void neon_wal_segment_close(NeonWALReader *state);
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
TimeLineID tli);
/*
* State of connection to donor safekeeper.
*/
typedef enum
{
/* no remote connection */
RS_NONE,
/* doing PQconnectPoll, need readable socket */
RS_CONNECTING_READ,
/* doing PQconnectPoll, need writable socket */
RS_CONNECTING_WRITE,
/* Waiting for START_REPLICATION result */
RS_WAIT_EXEC_RESULT,
/* replication stream established */
RS_ESTABLISHED,
} NeonWALReaderRemoteState;
struct NeonWALReader
{
/*
* LSN before which we assume WAL is not available locally. Exists because,
* though the first segment after startup always exists, the part before the
* basebackup LSN is filled with zeros.
*/
XLogRecPtr available_lsn;
WALSegmentContext segcxt;
WALOpenSegment seg;
int wre_errno;
/* Explains failure to read, static for simplicity. */
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
/*
* Saved info about request in progress, used to check validity of
* arguments after resume and remember how far we accomplished it. req_lsn
* is 0 if there is no request in progress.
*/
XLogRecPtr req_lsn;
Size req_len;
Size req_progress;
WalProposer *wp; /* we learn donor through walproposer */
char donor_name[64]; /* saved donor safekeeper name for logging */
/* state of connection to safekeeper */
NeonWALReaderRemoteState rem_state;
WalProposerConn *wp_conn;
/*
* position in wp_conn recvbuf from which we'll copy WAL next time, or
* NULL if there is no unprocessed message
*/
char *wal_ptr;
Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */
/*
* LSN of wal_ptr position according to walsender to cross check against
* read request
*/
XLogRecPtr rem_lsn;
/* prepended to lines logged by neon_walreader, if provided */
char log_prefix[64];
};
/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
{
NeonWALReader *reader;
reader = (NeonWALReader *)
palloc_extended(sizeof(NeonWALReader),
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
if (!reader)
return NULL;
reader->available_lsn = available_lsn;
reader->seg.ws_file = -1;
reader->seg.ws_segno = 0;
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;
reader->wp = wp;
reader->rem_state = RS_NONE;
if (log_prefix)
strncpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
return reader;
}
void
NeonWALReaderFree(NeonWALReader *state)
{
if (state->seg.ws_file != -1)
neon_wal_segment_close(state);
if (state->wp_conn)
libpqwp_disconnect(state->wp_conn);
pfree(state);
}
/*
* Like vanilla WALRead, but if requested position is before available_lsn or
* WAL segment doesn't exist on disk, it tries to fetch needed segment from the
* advanced safekeeper.
*
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
* fetched from timeline 'tli'.
*
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
* occurs, in which case 'err' has the description. Error always closes remote
* connection, if there was any, so socket subscription should be removed.
*
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
* NeonWALReaderSocket and call NeonWALRead again with exactly the same
* arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
* docs during connection establishment (before first successful read) socket
* underneath might change.
*
* Also, eventually walreader should switch from remote to local read; caller
* should remove subscription to socket then by checking NeonWALReaderEvents
* after successful read (otherwise next read might reopen the connection with
* different socket).
*
* Non-monotonic reads are not supported and will result in an error.
*
* Caller should be sure that WAL up to requested LSN exists, otherwise
* NEON_WALREAD_WOULDBLOCK might always be returned.
*/
NeonWALReadResult
NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
/*
* If requested data is before known available basebackup lsn or there is
* already active remote state, do remote read.
*/
if (startptr < state->available_lsn || state->rem_state != RS_NONE)
{
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
if (NeonWALReadLocal(state, buf, startptr, count, tli))
{
return NEON_WALREAD_SUCCESS;
}
else if (state->wre_errno == ENOENT)
{
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
LSN_FORMAT_ARGS(startptr));
return NeonWALReadRemote(state, buf, startptr, count, tli);
}
else
{
return NEON_WALREAD_ERROR;
}
}
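The contract documented above means the caller must retry with exactly the same arguments until the reader stops returning WOULDBLOCK, waiting on the reader's socket in between. A schematic, self-contained sketch of that calling convention (the closure stands in for NeonWALRead; socket waiting is elided):

enum ReadResult {
    Success,
    WouldBlock,
    Error,
}

fn read_fully(mut try_read: impl FnMut() -> ReadResult) -> Result<(), ()> {
    loop {
        match try_read() {
            ReadResult::Success => return Ok(()),
            // Wait for NeonWALReaderSocket() readiness, then call again
            // with exactly the same (buf, startptr, count) arguments.
            ReadResult::WouldBlock => continue,
            ReadResult::Error => return Err(()),
        }
    }
}

fn main() {
    let mut attempts = 0;
    let res = read_fully(|| {
        attempts += 1;
        if attempts < 3 { ReadResult::WouldBlock } else { ReadResult::Success }
    });
    assert!(res.is_ok());
    assert_eq!(attempts, 3);
}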
/* Do the read from remote safekeeper. */
static NeonWALReadResult
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
if (state->rem_state == RS_NONE)
{
XLogRecPtr donor_lsn;
/* no connection yet; start one */
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
if (donor == NULL)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to establish remote connection to fetch WAL: no donor available");
return NEON_WALREAD_ERROR;
}
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
state->wp_conn = libpqwp_connect_start(donor->conninfo);
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to connect to %s to fetch WAL: immediately failed with %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* we'll poll immediately */
state->rem_state = RS_CONNECTING_READ;
}
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
{
switch (PQconnectPoll(state->wp_conn->pg_conn))
{
case PGRES_POLLING_FAILED:
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to connect to %s to fetch WAL: poll error: %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
case PGRES_POLLING_READING:
state->rem_state = RS_CONNECTING_READ;
return NEON_WALREAD_WOULDBLOCK;
case PGRES_POLLING_WRITING:
state->rem_state = RS_CONNECTING_WRITE;
return NEON_WALREAD_WOULDBLOCK;
case PGRES_POLLING_OK:
{
/* connection successfully established */
char start_repl_query[128];
snprintf(start_repl_query, sizeof(start_repl_query),
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
state->donor_name, start_repl_query);
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to send %s query to %s: %s",
start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
state->rem_state = RS_WAIT_EXEC_RESULT;
break;
}
default: /* there is unused PGRES_POLLING_ACTIVE */
Assert(false);
return NEON_WALREAD_ERROR; /* keep the compiler quiet */
}
}
if (state->rem_state == RS_WAIT_EXEC_RESULT)
{
switch (libpqwp_get_query_result(state->wp_conn))
{
case WP_EXEC_SUCCESS_COPYBOTH:
state->rem_state = RS_ESTABLISHED;
break;
case WP_EXEC_NEEDS_INPUT:
return NEON_WALREAD_WOULDBLOCK;
case WP_EXEC_FAILED:
snprintf(state->err_msg, sizeof(state->err_msg),
"get START_REPLICATION result from %s failed: %s",
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
default: /* can't happen */
snprintf(state->err_msg, sizeof(state->err_msg),
"get START_REPLICATION result from %s: unexpected result",
state->donor_name);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
}
Assert(state->rem_state == RS_ESTABLISHED);
/*
* If we had the request before, verify args are the same and advance the
* result ptr according to the progress; otherwise register the request.
*/
if (state->req_lsn != InvalidXLogRecPtr)
{
if (state->req_lsn != startptr || state->req_len != count)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"args changed during request, was %X/%X %zu, now %X/%X %zu",
LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
LSN_FORMAT_ARGS(startptr),
count,
state->req_progress);
buf += state->req_progress;
}
else
{
state->req_lsn = startptr;
state->req_len = count;
state->req_progress = 0;
nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
LSN_FORMAT_ARGS(startptr),
count);
}
while (true)
{
Size to_copy;
/*
* If we have no ready data, receive new message.
*/
if (state->wal_rem_len == 0 &&
/*
* check for the sake of zero-length reads; walproposer does these for
* heartbeats, though generally they shouldn't hit the remote source.
*/
state->req_len - state->req_progress > 0)
{
NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
if (read_msg_res != NEON_WALREAD_SUCCESS)
return read_msg_res;
}
if (state->req_lsn + state->req_progress != state->rem_lsn)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
LSN_FORMAT_ARGS(state->rem_lsn),
LSN_FORMAT_ARGS(state->req_lsn),
state->req_len);
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* We can copy min of (available, requested) bytes. */
to_copy =
Min(state->req_len - state->req_progress, state->wal_rem_len);
memcpy(buf, state->wal_ptr, to_copy);
state->wal_ptr += to_copy;
state->wal_rem_len -= to_copy;
state->rem_lsn += to_copy;
if (state->wal_rem_len == 0)
state->wal_ptr = NULL; /* freed by libpqwalproposer */
buf += to_copy;
state->req_progress += to_copy;
if (state->req_progress == state->req_len)
{
XLogSegNo next_segno;
XLogSegNo req_segno;
XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
/*
* Request completed. If there is a chance of serving the next one
* locally, close the connection.
*/
if (state->req_lsn < state->available_lsn &&
state->rem_lsn >= state->available_lsn)
{
nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
NeonWALReaderResetRemote(state);
}
else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
{
nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
LSN_FORMAT_ARGS(state->rem_lsn));
NeonWALReaderResetRemote(state);
}
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
return NEON_WALREAD_SUCCESS;
}
}
}
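/*
 * Summary (descriptive note, not in the original source): the remote read
 * above is a small state machine,
 *
 *   RS_NONE -> RS_CONNECTING_READ/RS_CONNECTING_WRITE  (PQconnectPoll)
 *           -> RS_WAIT_EXEC_RESULT                     (START_REPLICATION sent)
 *           -> RS_ESTABLISHED                          (CopyBoth streaming)
 *
 * Every step returns NEON_WALREAD_WOULDBLOCK until its socket event arrives,
 * so the function is safe to call repeatedly from an event loop with the
 * same arguments.
 */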
/*
* Read one WAL message from the stream; sets state->wal_ptr on success and
* resets the remote state on failure.
*/
static NeonWALReadResult
NeonWALReaderReadMsg(NeonWALReader *state)
{
while (true) /* loop until we get 'w' */
{
char *copydata_ptr;
int copydata_size;
StringInfoData s;
char msg_type;
int hdrlen;
Assert(state->rem_state == RS_ESTABLISHED);
Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
switch (libpqwp_async_read(state->wp_conn,
&copydata_ptr,
&copydata_size))
{
case PG_ASYNC_READ_SUCCESS:
break;
case PG_ASYNC_READ_TRY_AGAIN:
return NEON_WALREAD_WOULDBLOCK;
case PG_ASYNC_READ_FAIL:
snprintf(state->err_msg,
sizeof(state->err_msg),
"req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
LSN_FORMAT_ARGS(state->req_lsn),
state->req_len,
state->req_progress,
PQerrorMessage(state->wp_conn->pg_conn));
goto err;
}
/* put data on StringInfo to parse */
s.data = copydata_ptr;
s.len = copydata_size;
s.cursor = 0;
s.maxlen = -1;
if (copydata_size == 0)
{
snprintf(state->err_msg,
sizeof(state->err_msg),
"zero length copydata received");
goto err;
}
msg_type = pq_getmsgbyte(&s);
switch (msg_type)
{
case 'w':
{
XLogRecPtr start_lsn;
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
if (s.len - s.cursor < hdrlen)
{
snprintf(state->err_msg,
sizeof(state->err_msg),
"invalid WAL message received from primary");
goto err;
}
start_lsn = pq_getmsgint64(&s);
pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */
pq_getmsgint64(&s); /* TimestampTz send_time */
state->rem_lsn = start_lsn;
state->wal_rem_len = (Size) (s.len - s.cursor);
state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
return NEON_WALREAD_SUCCESS;
}
case 'k':
{
XLogRecPtr end_lsn;
bool reply_requested;
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
if (s.len - s.cursor < hdrlen)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"invalid keepalive message received from primary");
goto err;
}
end_lsn = pq_getmsgint64(&s);
pq_getmsgint64(&s); /* TimestampTz timestamp; */
reply_requested = pq_getmsgbyte(&s);
nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
LSN_FORMAT_ARGS(end_lsn),
reply_requested);
if (end_lsn < state->req_lsn + state->req_len)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
goto err;
}
continue;
}
default:
nwr_log(WARNING, "invalid replication message type %d", msg_type);
continue;
}
}
err:
NeonWALReaderResetRemote(state);
return NEON_WALREAD_ERROR;
}
/* reset remote connection and request in progress */
static void
NeonWALReaderResetRemote(NeonWALReader *state)
{
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
state->rem_state = RS_NONE;
if (state->wp_conn)
{
libpqwp_disconnect(state->wp_conn);
state->wp_conn = NULL;
}
state->donor_name[0] = '\0';
state->wal_ptr = NULL;
state->wal_rem_len = 0;
state->rem_lsn = InvalidXLogRecPtr;
}
/*
* Return the socket of the connection to the remote source. Must be called
* only when the connection exists (NeonWALReaderEvents returns non-zero).
*/
pgsocket
NeonWALReaderSocket(NeonWALReader *state)
{
if (!state->wp_conn)
nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
return PQsocket(state->wp_conn->pg_conn);
}
/*
* Returns the events the caller should wait for on the connection socket,
* or 0 if the remote connection is not active.
*/
extern uint32
NeonWALReaderEvents(NeonWALReader *state)
{
switch (state->rem_state)
{
case RS_NONE:
return 0;
case RS_CONNECTING_READ:
return WL_SOCKET_READABLE;
case RS_CONNECTING_WRITE:
return WL_SOCKET_WRITEABLE;
case RS_WAIT_EXEC_RESULT:
case RS_ESTABLISHED:
return WL_SOCKET_READABLE;
default:
Assert(false);
return 0; /* make compiler happy */
}
}
static bool
NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
char *p;
XLogRecPtr recptr;
Size nbytes;
p = buf;
recptr = startptr;
nbytes = count;
while (nbytes > 0)
{
uint32 startoff;
int segbytes;
int readbytes;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
/*
* If the data we want is not in a segment we have open, close what we
* have (if anything) and open the next one, using the caller's
* provided openSegment callback.
*/
if (state->seg.ws_file < 0 ||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
tli != state->seg.ws_tli)
{
XLogSegNo nextSegNo;
neon_wal_segment_close(state);
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
if (!neon_wal_segment_open(state, nextSegNo, &tli))
{
char fname[MAXFNAMELEN];
state->wre_errno = errno;
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
return false;
}
/* This shouldn't happen -- indicates a bug in segment_open */
Assert(state->seg.ws_file >= 0);
/* Update the current segment info. */
state->seg.ws_tli = tli;
state->seg.ws_segno = nextSegNo;
}
/* How many bytes are within this segment? */
if (nbytes > (state->segcxt.ws_segsize - startoff))
segbytes = state->segcxt.ws_segsize - startoff;
else
segbytes = nbytes;
#ifndef FRONTEND
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif
/* Reset errno first; eases reporting non-errno-affecting errors */
errno = 0;
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
#ifndef FRONTEND
pgstat_report_wait_end();
#endif
if (readbytes <= 0)
{
char fname[MAXFNAMELEN];
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
if (readbytes < 0)
{
state->wre_errno = errno;
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
fname, startoff, strerror(state->wre_errno));
}
else
{
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
fname, startoff);
}
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;
p += readbytes;
}
return true;
}
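/*
 * Worked example (added for illustration): with the default 16 MB segment
 * size (ws_segsize = 16777216), a 24576-byte read starting at segment
 * offset 16771072 copies segsize - startoff = 6144 bytes from the current
 * segment, then loops: the next iteration opens the following segment and
 * reads the remaining 18432 bytes at offset 0.
 */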
/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
TimeLineID tli = *tli_p;
char path[MAXPGPATH];
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
nwr_log(LOG, "opening %s", path);
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
if (state->seg.ws_file >= 0)
return true;
return false;
}
static bool
is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
{
struct stat stat_buffer;
char path[MAXPGPATH];
XLogFilePath(path, tli, segno, segsize);
return stat(path, &stat_buffer) == 0;
}
/* copy of vanilla wal_segment_close with NeonWALReader */
static void
neon_wal_segment_close(NeonWALReader *state)
{
if (state->seg.ws_file >= 0)
{
close(state->seg.ws_file);
/* need to check errno? */
state->seg.ws_file = -1;
}
}
char *
NeonWALReaderErrMsg(NeonWALReader *state)
{
return state->err_msg;
}

View File

@@ -1,29 +0,0 @@
#ifndef __NEON_WALREADER_H__
#define __NEON_WALREADER_H__
#include "access/xlogdefs.h"
/* forward declare so we don't have to expose the struct to the public */
struct NeonWALReader;
typedef struct NeonWALReader NeonWALReader;
/* avoid including walproposer.h as it includes us */
struct WalProposer;
typedef struct WalProposer WalProposer;
/* NeonWALRead return value */
typedef enum
{
NEON_WALREAD_SUCCESS,
NEON_WALREAD_WOULDBLOCK,
NEON_WALREAD_ERROR,
} NeonWALReadResult;
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
extern void NeonWALReaderFree(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
#endif /* __NEON_WALREADER_H__ */
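/*
 * Usage sketch (illustrative, not part of this header): a blocking wrapper a
 * caller might write around the nonblocking API above. wait_for_socket() is
 * a hypothetical stand-in for the caller's WaitEventSet handling. Note that
 * after NEON_WALREAD_WOULDBLOCK the read must be retried with the exact same
 * arguments, as NeonWALRead tracks request progress internally.
 */
static bool
read_wal_blocking(NeonWALReader *reader, char *buf, XLogRecPtr startptr,
                  Size count, TimeLineID tli)
{
    for (;;)
    {
        switch (NeonWALRead(reader, buf, startptr, count, tli))
        {
            case NEON_WALREAD_SUCCESS:
                return true;
            case NEON_WALREAD_WOULDBLOCK:
                /* sleep until the reader's socket is ready, then retry */
                wait_for_socket(NeonWALReaderSocket(reader),
                                NeonWALReaderEvents(reader));
                break;
            case NEON_WALREAD_ERROR:
                return false;   /* details via NeonWALReaderErrMsg(reader) */
        }
    }
}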

View File

@@ -40,13 +40,13 @@ typedef enum
T_NeonGetPageResponse,
T_NeonErrorResponse,
T_NeonDbSizeResponse,
} NeonMessageTag;
/* base struct for c-style inheritance */
typedef struct
{
NeonMessageTag tag;
} NeonMessage;
#define messageTag(m) (((const NeonMessage *)(m))->tag)
@@ -67,27 +67,27 @@ typedef struct
NeonMessageTag tag;
bool latest; /* if true, request latest page version */
XLogRecPtr lsn; /* request page version @ this LSN */
} NeonRequest;
typedef struct
{
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonExistsRequest;
typedef struct
{
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonNblocksRequest;
typedef struct
{
NeonRequest req;
Oid dbNode;
} NeonDbSizeRequest;
typedef struct
{
@@ -95,31 +95,31 @@ typedef struct
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
} NeonGetPageRequest;
/* supertype of all the Neon*Response structs below */
typedef struct
{
NeonMessageTag tag;
} NeonResponse;
typedef struct
{
NeonMessageTag tag;
bool exists;
} NeonExistsResponse;
typedef struct
{
NeonMessageTag tag;
uint32 n_blocks;
} NeonNblocksResponse;
typedef struct
{
NeonMessageTag tag;
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
@@ -127,18 +127,18 @@ typedef struct
{
NeonMessageTag tag;
int64 db_size;
} NeonDbSizeResponse;
typedef struct
{
NeonMessageTag tag;
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
* message */
} NeonErrorResponse;
extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);
extern StringInfoData nm_pack_request(NeonRequest * msg);
extern NeonResponse * nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage * msg);
/*
* API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage *msg);
typedef struct
{
bool (*send) (NeonRequest *request);
bool (*send) (NeonRequest * request);
NeonResponse *(*receive) (void);
bool (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
extern page_server_api *page_server;
extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
extern char *neon_tenant;
extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif

View File

@@ -63,6 +63,7 @@
#include "storage/md.h"
#include "pgstat.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogutils.h"
#include "access/xlogrecovery.h"
@@ -100,21 +101,21 @@ typedef enum
UNLOGGED_BUILD_PHASE_1,
UNLOGGED_BUILD_PHASE_2,
UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* Prefetch implementation:
*
* Prefetch is performed locally by each backend.
*
* There can be up to readahead_buffer_size active IO requests registered at
* any time. Requests using smgr_prefetch are sent to the pageserver, but we
* don't wait on the response. Requests using smgr_read are either read from
* the buffer, or (if that's not possible) we wait on the response to arrive -
* this also will allow us to receive other prefetched pages.
* Each request is immediately written to the output buffer of the pageserver
* connection, but may not be flushed if smgr_prefetch is used: pageserver
* flushes sent requests on manual flush, or every neon.flush_output_after
@@ -138,7 +139,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* State machine:
*
* not in hash : in hash
* :
* UNUSED ------> REQUESTED --> RECEIVED
@@ -149,34 +150,30 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
* +----------------+------------+
* :
*/
typedef enum PrefetchStatus
{
PRFS_UNUSED = 0, /* unused slot */
PRFS_REQUESTED, /* request was written to the sendbuffer to
* PS, but not necessarily flushed. all fields
* except response valid */
PRFS_RECEIVED, /* all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still
* valid */
typedef enum PrefetchStatus {
PRFS_UNUSED = 0, /* unused slot */
PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
* necessarily flushed.
* all fields except response valid */
PRFS_RECEIVED, /* all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
} PrefetchStatus;
typedef struct PrefetchRequest
{
BufferTag buftag; /* must be first entry in the struct */
typedef struct PrefetchRequest {
BufferTag buftag; /* must be first entry in the struct */
XLogRecPtr effective_request_lsn;
XLogRecPtr actual_request_lsn;
NeonResponse *response; /* may be null */
PrefetchStatus status;
uint64 my_ring_index;
} PrefetchRequest;
/* prefetch buffer lookup hash table */
typedef struct PrfHashEntry
{
typedef struct PrfHashEntry {
PrefetchRequest *slot;
uint32 status;
uint32 hash;
} PrfHashEntry;
#define SH_PREFIX prfh
@@ -200,42 +197,36 @@ typedef struct PrfHashEntry
/*
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
* It maintains a (ring) buffer of in-flight requests and responses.
*
* We maintain several indexes into the ring buffer:
* ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
*
* ring_unused points to the first unused slot of the buffer
* ring_receive is the next request that is to be received
* ring_last is the oldest received entry in the buffer
*
* Apart from being an entry in the ring buffer of prefetch requests, each
* PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
*/
typedef struct PrefetchState
{
MemoryContext bufctx; /* context for prf_buffer[].response
* allocations */
MemoryContext errctx; /* context for prf_buffer[].response
* allocations */
MemoryContext hashctx; /* context for prf_buffer */
typedef struct PrefetchState {
MemoryContext bufctx; /* context for prf_buffer[].response allocations */
MemoryContext errctx; /* context for prf_buffer[].response allocations */
MemoryContext hashctx; /* context for prf_buffer */
/* buffer indexes */
uint64 ring_unused; /* first unused slot */
uint64 ring_flush; /* next request to flush */
uint64 ring_receive; /* next slot that is to receive a response */
uint64 ring_last; /* min slot with a response value */
/* metrics / statistics */
int n_responses_buffered; /* count of PS responses not yet in
* buffers */
int n_requests_inflight; /* count of PS requests considered in
* flight */
int n_unused; /* count of buffers < unused, > last, that are
* also unused */
int n_responses_buffered; /* count of PS responses not yet in buffers */
int n_requests_inflight; /* count of PS requests considered in flight */
int n_unused; /* count of buffers < unused, > last, that are also unused */
/* the buffers */
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
PrefetchState *MyPState;
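/*
 * Illustrative helper (not in the original source): the ring indexes above
 * must always satisfy ring_unused >= ring_flush >= ring_receive >= ring_last,
 * as documented in the PrefetchState comment; a debug build could assert it
 * like this.
 */
static inline void
prefetch_check_ring_invariant(const PrefetchState *ps)
{
    Assert(ps->ring_unused >= ps->ring_flush);
    Assert(ps->ring_flush >= ps->ring_receive);
    Assert(ps->ring_receive >= ps->ring_last);
}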
@@ -273,10 +264,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
static bool
compact_prefetch_buffers(void)
{
uint64 empty_ring_index = MyPState->ring_last;
uint64 search_ring_index = MyPState->ring_receive;
int n_moved = 0;
if (MyPState->ring_receive == MyPState->ring_last)
return false;
@@ -291,14 +282,15 @@ compact_prefetch_buffers(void)
}
/*
* Here we have established: slots < search_ring_index have an unknown
* state (not scanned) slots >= search_ring_index and <= empty_ring_index
* are unused slots > empty_ring_index are in use, or outside our buffer's
* range. ... unless search_ring_index <= ring_last
*
* Here we have established:
* slots < search_ring_index have an unknown state (not scanned)
* slots >= search_ring_index and <= empty_ring_index are unused
* slots > empty_ring_index are in use, or outside our buffer's range.
* ... unless search_ring_index <= ring_last
*
* Therefore, there is a gap of at least one unused items between
* search_ring_index and empty_ring_index (both inclusive), which grows as
* we hit more unused items while moving backwards through the array.
* search_ring_index and empty_ring_index (both inclusive), which grows as we hit
* more unused items while moving backwards through the array.
*/
while (search_ring_index > MyPState->ring_last)
@@ -338,10 +330,7 @@ compact_prefetch_buffers(void)
/* empty the moved slot */
source_slot->status = PRFS_UNUSED;
source_slot->buftag = (BufferTag)
{
0
};
source_slot->buftag = (BufferTag) {0};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->effective_request_lsn = 0;
@@ -351,8 +340,8 @@ compact_prefetch_buffers(void)
}
/*
* Only when we've moved slots we can expect trailing unused slots, so
* only then we clean up trailing unused slots.
* Only when we've moved slots we can expect trailing unused slots,
* so only then we clean up trailing unused slots.
*/
if (n_moved > 0)
{
@@ -369,10 +358,10 @@ readahead_buffer_resize(int newsize, void *extra)
uint64 end,
nfree = newsize;
PrefetchState *newPState;
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * newsize
);
/* don't try to re-initialize if we haven't initialized yet */
if (MyPState == NULL)
return;
@@ -399,12 +388,12 @@ readahead_buffer_resize(int newsize, void *extra)
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
/*
* Copy over the prefetches.
*
* We populate the prefetch array from the end; to retain the most recent
* prefetches, but this has the benefit of only needing to do one
* iteration on the dataset, and trivial compaction.
* prefetches, but this has the benefit of only needing to do one iteration
* on the dataset, and trivial compaction.
*/
for (end = MyPState->ring_unused - 1;
end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
@@ -412,7 +401,7 @@ readahead_buffer_resize(int newsize, void *extra)
{
PrefetchRequest *slot = GetPrfSlot(end);
PrefetchRequest *newslot;
bool found;
if (slot->status == PRFS_UNUSED)
continue;
@@ -475,11 +464,10 @@ consume_prefetch_responses(void)
static void
prefetch_cleanup_trailing_unused(void)
{
uint64 ring_index;
PrefetchRequest *slot;
while (MyPState->ring_last < MyPState->ring_receive)
{
while (MyPState->ring_last < MyPState->ring_receive) {
ring_index = MyPState->ring_last;
slot = GetPrfSlot(ring_index);
@@ -493,7 +481,7 @@ prefetch_cleanup_trailing_unused(void)
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
@@ -525,7 +513,7 @@ prefetch_wait_for(uint64 ring_index)
/*
* Read the response of a prefetch request into its slot.
*
* The caller is responsible for making sure that the request for this buffer
* was flushed to the PageServer.
*
@@ -565,7 +553,7 @@ prefetch_read(PrefetchRequest *slot)
/*
* Disconnect hook - drop prefetches when the connection drops
*
* If we don't remove the failed prefetches, we'd be serving incorrect
* data to the smgr.
*/
@@ -576,7 +564,7 @@ prefetch_on_ps_disconnect(void)
while (MyPState->ring_receive < MyPState->ring_unused)
{
PrefetchRequest *slot;
uint64 ring_index = MyPState->ring_receive;
slot = GetPrfSlot(ring_index);
@@ -606,7 +594,7 @@ prefetch_set_unused(uint64 ring_index)
PrefetchRequest *slot = GetPrfSlot(ring_index);
if (ring_index < MyPState->ring_last)
return; /* Should already be unused */
Assert(MyPState->ring_unused > ring_index);
@@ -637,11 +625,7 @@ prefetch_set_unused(uint64 ring_index)
/* run cleanup if we're holding back ring_last */
if (MyPState->ring_last == ring_index)
prefetch_cleanup_trailing_unused();
/*
* ... and try to store the buffered responses more compactly if > 12.5%
* of the buffer is gaps
*/
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
else if (ReceiveBufferNeedsCompaction())
compact_prefetch_buffers();
}
@@ -649,7 +633,7 @@ prefetch_set_unused(uint64 ring_index)
static void
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
bool found;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.req.latest = false,
@@ -667,22 +651,21 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
}
else
{
XLogRecPtr lsn = neon_get_request_lsn(
&request.req.latest,
BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum
);
/*
* Note: effective_request_lsn is potentially higher than the
* requested LSN, but still correct:
*
* Note: effective_request_lsn is potentially higher than the requested
* LSN, but still correct:
*
* We know there are no changes between the actual requested LSN and
* the value of effective_request_lsn: If there were, the page would
* have been in cache and evicted between those LSN values, which then
* would have had to result in a larger request LSN for this page.
*
* have been in cache and evicted between those LSN values, which
* then would have had to result in a larger request LSN for this page.
*
* It is possible that a concurrent backend loads the page, modifies
* it and then evicts it again, but the LSN of that eviction cannot be
* smaller than the current WAL insert/redo pointer, which is already
@@ -719,7 +702,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
* prefetch_register_buffer() - register and prefetch buffer
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_latest and force_lsn are not NULL, those values are sent to the
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
* to fill in these values manually.
@@ -731,14 +714,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
static uint64
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
Retry:
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
if (entry != NULL)
@@ -758,10 +741,7 @@ Retry:
*/
if (force_latest && force_lsn)
{
/*
* if we want the latest version, any effective_request_lsn <
* request lsn is OK
*/
/* if we want the latest version, any effective_request_lsn < request lsn is OK */
if (*force_latest)
{
if (*force_lsn > slot->effective_request_lsn)
@@ -772,11 +752,7 @@ Retry:
}
}
/*
* if we don't want the latest version, only accept requests with
* the exact same LSN
*/
/* if we don't want the latest version, only accept requests with the exact same LSN */
else
{
if (*force_lsn != slot->effective_request_lsn)
@@ -823,8 +799,7 @@ Retry:
*/
if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
{
uint64 cleanup_index = MyPState->ring_last;
slot = GetPrfSlot(cleanup_index);
Assert(slot->status != PRFS_UNUSED);
@@ -839,10 +814,7 @@ Retry:
}
else
{
/*
* We have the slot for ring_last, so that must still be in
* progress
*/
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
case PRFS_REQUESTED:
@@ -861,8 +833,8 @@ Retry:
}
/*
* The next buffer pointed to by `ring_unused` is now definitely empty, so
* we can insert the new request to it.
* The next buffer pointed to by `ring_unused` is now definitely empty,
* so we can insert the new request to it.
*/
ring_index = MyPState->ring_unused;
slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
@@ -888,10 +860,7 @@ Retry:
{
if (!page_server->flush())
{
/*
* Prefetch set is reset in case of error, so we should try to
* register our request once again
*/
/* Prefetch set is reset in case of error, so we should try to register our request once again */
goto Retry;
}
MyPState->ring_flush = MyPState->ring_unused;
@@ -903,10 +872,8 @@ Retry:
static NeonResponse *
page_server_request(void const *req)
{
NeonResponse *resp;
do
{
NeonResponse* resp;
do {
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
@@ -918,7 +885,7 @@ page_server_request(void const *req)
StringInfoData
nm_pack_request(NeonRequest *msg)
nm_pack_request(NeonRequest * msg)
{
StringInfoData s;
@@ -1034,7 +1001,7 @@ nm_unpack_response(StringInfo s)
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
@@ -1090,7 +1057,7 @@ nm_unpack_response(StringInfo s)
/* dump to json for debugging / error reporting purposes */
char *
nm_to_string(NeonMessage *msg)
nm_to_string(NeonMessage * msg)
{
StringInfoData s;
@@ -1219,7 +1186,7 @@ nm_to_string(NeonMessage *msg)
* directly because it skips the logging if the LSN is new enough.
*/
static XLogRecPtr
log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
Page page, bool page_std)
{
PGAlignedBlock copied_buffer;
@@ -1242,11 +1209,11 @@ PageIsEmptyHeapPage(char *buffer)
}
static void
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
char *buffer, bool force)
#else
const char *buffer, bool force)
#endif
{
XLogRecPtr lsn = PageGetLSN((Page) buffer);
@@ -1346,24 +1313,24 @@ static void
void
neon_init(void)
{
Size prfs_size;
if (MyPState != NULL)
return;
prfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * readahead_buffer_size
);
MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
MyPState->n_unused = readahead_buffer_size;
MyPState->bufctx = SlabContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
SLAB_DEFAULT_BLOCK_SIZE * 17,
PS_GETPAGERESPONSE_SIZE);
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/errors",
ALLOCSET_DEFAULT_SIZES);
MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1428,6 +1395,12 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
(uint32) ((lsn) >> 32), (uint32) (lsn));
}
else if (am_walsender)
{
*latest = true;
lsn = InvalidXLogRecPtr;
elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 ");
}
else
{
XLogRecPtr flushlsn;
@@ -1603,14 +1576,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Note that in REDO, this is called to make sure the relation fork exists,
* but it does not truncate the relation. So, we can only update the
* relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
* relation, as that is the primary source of truth for REDO's
* file length considerations, and as file extension isn't (perfectly)
* logged, we need to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1686,7 +1659,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#endif
{
XLogRecPtr lsn;
BlockNumber n_blocks = 0;
switch (reln->smgr_relpersistence)
{
@@ -1727,10 +1700,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
}
/*
* Usually Postgres doesn't extend relation on more than one page (leaving
* holes). But this rule is violated in PG-15 where
* CreateAndCopyRelationData call smgrextend for destination relation n
* using size of source relation
* Usually Postgres doesn't extend relation on more than one page
* (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
* call smgrextend for destination relation n using size of source relation
*/
n_blocks = neon_nblocks(reln, forkNum);
while (n_blocks < blkno)
@@ -1751,13 +1723,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
* smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
* An smgr_write() call will come for the buffer later, after it has been initialized
* with the real page contents, and it is eventually evicted from the buffer cache.
* But we need a valid LSN to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
@@ -1816,9 +1786,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
/* Don't log any pages if we're not allowed to do so. */
if (!XLogInsertAllowed())
@@ -1905,7 +1875,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
case RELPERSISTENCE_PERMANENT:
break;
@@ -1920,10 +1890,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
tag = (BufferTag)
{
tag = (BufferTag) {
.forkNum = forknum,
.blockNum = blocknum
};
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
@@ -1978,11 +1947,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
* To avoid breaking tests in the runtime please keep function signature in sync.
*/
#if PG_MAJORVERSION_NUM < 16
void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer)
#else
void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, void *buffer)
#endif
@@ -1993,21 +1962,21 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
PrfHashEntry *entry;
PrefetchRequest *slot;
buftag = (BufferTag)
{
buftag = (BufferTag) {
.forkNum = forkNum,
.blockNum = blkno,
};
CopyNRelFileInfoToBufTag(buftag, rinfo);
/*
* The redo process does not lock pages that it needs to replay but are
* not in the shared buffers, so a concurrent process may request the page
* after redo has decided it won't redo that page and updated the LwLSN
* for that page. If we're in hot standby we need to take care that we
* don't return until after REDO has finished replaying up to that LwLSN,
* as the page should have been locked up to that point.
* not in the shared buffers, so a concurrent process may request the
* page after redo has decided it won't redo that page and updated the
* LwLSN for that page.
* If we're in hot standby we need to take care that we don't return
* until after REDO has finished replaying up to that LwLSN, as the page
* should have been locked up to that point.
*
* See also the description on neon_redo_read_buffer_filter below.
*
@@ -2015,7 +1984,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* concurrent failed read IOs. Those IOs should never have a request_lsn
* that is as large as the WAL record we're currently replaying, if it
* weren't for the behaviour of the LwLsn cache that uses the highest
* value of the LwLsn cache when the entry is not found.
*/
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
@@ -2033,14 +2002,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
}
else /* the current prefetch LSN is not large
* enough, so drop the prefetch */
else /* the current prefetch LSN is not large enough, so drop the prefetch */
{
/*
* We can't drop cache for not-yet-received requested items. It is
* unlikely this happens, but it can happen if prefetch distance
* is large enough and a backend didn't consume all prefetch
* requests.
* unlikely this happens, but it can happen if prefetch distance is
* large enough and a backend didn't consume all prefetch requests.
*/
if (slot->status == PRFS_REQUESTED)
{
@@ -2067,11 +2034,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
else
{
/*
* Empty our reference to the prefetch buffer's hash entry. When
* we wait for prefetches, the entry reference is invalidated by
* potential updates to the hash, and when we reconnect to the
* pageserver the prefetch we're waiting for may be dropped, in
* which case we need to retry and take the branch above.
* Empty our reference to the prefetch buffer's hash entry.
* When we wait for prefetches, the entry reference is invalidated by
* potential updates to the hash, and when we reconnect to the
* pageserver the prefetch we're waiting for may be dropped,
* in which case we need to retry and take the branch above.
*/
entry = NULL;
}
@@ -2119,11 +2086,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* neon_read() -- Read the specified block from a relation.
*/
void
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
char *buffer)
#else
void *buffer)
#endif
{
bool latest;
@@ -2258,11 +2225,11 @@ hexdump_page(char *page)
* use mdextend().
*/
void
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
char *buffer, bool skipFsync)
#else
const void *buffer, bool skipFsync)
#endif
{
XLogRecPtr lsn;
@@ -2764,7 +2731,7 @@ smgr_init_neon(void)
/*
* Return whether we can skip the redo for this block.
*
* The conditions for skipping the IO are:
*
* - The block is not in the shared buffers, and
@@ -2803,7 +2770,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
XLogRecPtr end_recptr = record->EndRecPtr;
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
BufferTag tag;
uint32 hash;
LWLock *partitionLock;
@@ -2823,8 +2790,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
/*
* Out of an abundance of caution, we always run redo on shared catalogs,
* regardless of whether the block is stored in shared buffers. See also
* this function's top comment.
* regardless of whether the block is stored in shared buffers.
* See also this function's top comment.
*/
if (!OidIsValid(NInfoGetDbOid(rinfo)))
return false;
@@ -2850,9 +2817,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
/* In both cases set lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
/*
 * we don't have the buffer in memory, update lwLsn past this record, also
 * evict page from file cache
 */
if (no_redo_needed)
lfc_evict(rinfo, forknum, blkno);
@@ -2872,11 +2838,11 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
else
{
/*
* Size was not cached. We populate the cache now, with the size of
* the relation measured after this WAL record is applied.
* Size was not cached. We populate the cache now, with the size of the
* relation measured after this WAL record is applied.
*
* This length is later reused when we open the smgr to read the
* block, which is fine and expected.
* This length is later reused when we open the smgr to read the block,
* which is fine and expected.
*/
NeonResponse *response;

File diff suppressed because it is too large

View File

@@ -10,9 +10,6 @@
#include "utils/uuid.h"
#include "replication/walreceiver.h"
#include "libpqwalproposer.h"
#include "neon_walreader.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
@@ -25,9 +22,43 @@
*/
#define WL_NO_EVENTS 0
struct WalProposerConn; /* Defined in libpqwalproposer.h */
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
typedef struct WalProposerConn WalProposerConn;
/* Possible return values from ReadPGAsync */
typedef enum
{
/* The full read was successful. buf now points to the data */
PG_ASYNC_READ_SUCCESS,
/*
* The read is ongoing. Wait until the connection is read-ready, then try
* again.
*/
PG_ASYNC_READ_TRY_AGAIN,
/* Reading failed. Check PQerrorMessage(conn) */
PG_ASYNC_READ_FAIL,
} PGAsyncReadResult;
/* Possible return values from WritePGAsync */
typedef enum
{
/* The write fully completed */
PG_ASYNC_WRITE_SUCCESS,
/*
* The write started, but you'll need to call PQflush some more times to
* finish it off. We just tried, so it's best to wait until the connection
* is read- or write-ready to try again.
*
* If it becomes read-ready, call PQconsumeInput and flush again. If it
* becomes write-ready, just call PQflush.
*/
PG_ASYNC_WRITE_TRY_FLUSH,
/* Writing failed. Check PQerrorMessage(conn) */
PG_ASYNC_WRITE_FAIL,
} PGAsyncWriteResult;
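/*
 * Illustrative sketch (not part of this header): the retry protocol the
 * PG_ASYNC_WRITE_TRY_FLUSH comment above describes, in plain libpq terms.
 * socket_became_readable() is a hypothetical stand-in for the caller's
 * event-loop wait.
 */
static bool
finish_async_write(PGconn *conn)
{
    for (;;)
    {
        int rc = PQflush(conn);

        if (rc == 0)
            return true;        /* everything flushed */
        if (rc < 0)
            return false;       /* failed; see PQerrorMessage(conn) */

        /* rc > 0: wait for read- or write-readiness on PQsocket(conn) */
        if (socket_became_readable(PQsocket(conn)))
        {
            /* read-ready: absorb input first, then flush again */
            if (!PQconsumeInput(conn))
                return false;
        }
    }
}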
/*
* WAL safekeeper state, which is used to wait for some event.
*
@@ -104,40 +135,6 @@ typedef enum
SS_ACTIVE,
} SafekeeperState;
/*
* Sending WAL substates of SS_ACTIVE.
*/
typedef enum
{
/*
* We are ready to send more WAL, waiting for latch set to learn about
* more WAL becoming available (or just a timeout to send heartbeat).
*/
SS_ACTIVE_SEND,
/*
* Polling neon_walreader to receive chunk of WAL (probably remotely) to
* send to this safekeeper.
*
* Note: socket management is done completely inside walproposer_pg for
* simplicity, and thus simulation doesn't test it. Which is fine as
* simulation is mainly aimed at consensus checks, not waiteventset
* management.
*
* Also, while in this state we don't touch safekeeper socket, so in
* theory it might close connection as inactive. This can be addressed if
* needed; however, while fetching WAL we should regularly send it, so the
* problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
* walreader socket), but similarly shouldn't be a problem.
*/
SS_ACTIVE_READ_WAL,
/*
* Waiting for write readiness to flush the socket.
*/
SS_ACTIVE_FLUSH,
} SafekeeperActiveState;
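/*
 * Descriptive note (not in the original header): per send attempt the
 * substates above cycle roughly as
 *
 *   SS_ACTIVE_SEND --new WAL available--> SS_ACTIVE_READ_WAL
 *   --chunk read--> send to safekeeper --socket full--> SS_ACTIVE_FLUSH
 *   --flushed--> SS_ACTIVE_SEND
 *
 * with SS_ACTIVE_FLUSH skipped when the write fits into the socket buffer.
 */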
/* Consensus logical timestamp. */
typedef uint64 term_t;
@@ -336,30 +333,6 @@ typedef struct Safekeeper
*/
char conninfo[MAXCONNINFO];
/*
* Temporary buffer for the message being sent to the safekeeper.
*/
StringInfoData outbuf;
/*
* Streaming will start here; must be record boundary.
*/
XLogRecPtr startStreamingAt;
XLogRecPtr streamingAt; /* current streaming position */
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
SafekeeperState state; /* safekeeper state machine state */
SafekeeperActiveState active_state;
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
AppendResponse appendResponse; /* feedback for master */
/* postgres-specific fields */
#ifndef WALPROPOSER_LIB
/*
* postgres protocol connection to the WAL acceptor
*
@@ -368,32 +341,33 @@ typedef struct Safekeeper
*/
WalProposerConn *conn;
/*
* Temporary buffer for the message being sent to the safekeeper.
*/
StringInfoData outbuf;
/*
* WAL reader, allocated for each safekeeper.
*/
NeonWALReader *xlogreader;
XLogReaderState *xlogreader;
/*
* Position in wait event set. Equal to -1 if no event
* Streaming will start here; must be record boundary.
*/
int eventPos;
XLogRecPtr startStreamingAt;
/*
* Neon WAL reader position in wait event set, or -1 if no socket.
*/
int nwrEventPos;
#endif
bool flushWrite; /* set to true if we need to call AsyncFlush to flush pending messages */
XLogRecPtr streamingAt; /* current streaming position */
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
/* WalProposer library specifics */
#ifdef WALPROPOSER_LIB
/*
* Buffer for incoming messages. Usually Rust vector is stored here.
* Caller is responsible for freeing the buffer.
*/
StringInfoData inbuf;
#endif
int eventPos; /* position in wait event set. Equal to -1 if no event */
SafekeeperState state; /* safekeeper state machine state */
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
AppendResponse appendResponse; /* feedback for master */
} Safekeeper;
/* Re-exported PostgresPollingStatusType */
@@ -410,6 +384,31 @@ typedef enum
*/
} WalProposerConnectPollStatusType;
/* Re-exported and modified ExecStatusType */
typedef enum
{
/* We received a single CopyBoth result */
WP_EXEC_SUCCESS_COPYBOTH,
/*
* Any success result other than a single CopyBoth was received. The
* specifics of the result were already logged, but it may be useful to
* provide an error message indicating which safekeeper messed up.
*
* Do not expect PQerrorMessage to be appropriately set.
*/
WP_EXEC_UNEXPECTED_SUCCESS,
/*
* No result available at this time. Wait until read-ready, then call
* again. Internally, this is returned when PQisBusy indicates that
* PQgetResult would block.
*/
WP_EXEC_NEEDS_INPUT,
/* Catch-all failure. Check PQerrorMessage. */
WP_EXEC_FAILED,
} WalProposerExecStatusType;
/* Re-exported ConnStatusType */
typedef enum
{
@@ -434,7 +433,7 @@ typedef struct walproposer_api
* Get WalproposerShmemState. This is used to store information about last
* elected term.
*/
WalproposerShmemState *(*get_shmem_state) (WalProposer *wp);
WalproposerShmemState *(*get_shmem_state) (void);
/*
* Start receiving notifications about new WAL. This is an infinite loop
@@ -444,73 +443,68 @@ typedef struct walproposer_api
void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
/* Get pointer to the latest available WAL. */
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
XLogRecPtr (*get_flush_rec_ptr) (void);
/* Get current time. */
TimestampTz (*get_current_timestamp) (WalProposer *wp);
TimestampTz (*get_current_timestamp) (void);
/* Get postgres timeline. */
TimeLineID (*get_timeline_id) (void);
/* Current error message, aka PQerrorMessage. */
char *(*conn_error_message) (Safekeeper *sk);
char *(*conn_error_message) (WalProposerConn *conn);
/* Connection status, aka PQstatus. */
WalProposerConnStatusType (*conn_status) (Safekeeper *sk);
WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
/* Start the connection, aka PQconnectStart. */
void (*conn_connect_start) (Safekeeper *sk);
WalProposerConn *(*conn_connect_start) (char *conninfo);
/* Poll an asynchronous connection, aka PQconnectPoll. */
WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
/* Send a blocking SQL query, aka PQsendQuery. */
bool (*conn_send_query) (Safekeeper *sk, char *query);
bool (*conn_send_query) (WalProposerConn *conn, char *query);
/* Read the query result, aka PQgetResult. */
WalProposerExecStatusType (*conn_get_query_result) (Safekeeper *sk);
WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
/* Flush buffer to the network, aka PQflush. */
int (*conn_flush) (Safekeeper *sk);
int (*conn_flush) (WalProposerConn *conn);
/* Close the connection, aka PQfinish. */
void (*conn_finish) (Safekeeper *sk);
void (*conn_finish) (WalProposerConn *conn);
/*
* Try to read CopyData message from the safekeeper, aka PQgetCopyData.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
/* Try to read CopyData message, aka PQgetCopyData. */
PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
/* Try to write CopyData message, aka PQputCopyData. */
PGAsyncWriteResult (*conn_async_write) (Safekeeper *sk, void const *buf, size_t size);
PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
bool (*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
/* Download WAL from startpos to endpos and make it available locally. */
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
/* Read WAL from disk to buf. */
NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
void (*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
/* Allocate WAL reader. */
void (*wal_reader_allocate) (Safekeeper *sk);
XLogReaderState *(*wal_reader_allocate) (void);
/* Deallocate event set. */
void (*free_event_set) (void);
/* Initialize event set. */
void (*init_event_set) (WalProposer *wp);
void (*init_event_set) (int n_safekeepers);
/* Update events for an existing safekeeper connection. */
void (*update_event_set) (Safekeeper *sk, uint32 events);
/* Configure wait event set for yield in SS_ACTIVE. */
void (*active_state_update_event_set) (Safekeeper *sk);
/* Add a new safekeeper connection to the event set. */
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
/* Remove safekeeper connection from event set */
void (*rm_safekeeper_event_set) (Safekeeper *sk);
/*
* Wait until some event happens: - timeout is reached - socket event for
* safekeeper connection - new WAL is available
@@ -519,22 +513,22 @@ typedef struct walproposer_api
* events mask to indicate events and sets sk to the safekeeper which has
* an event.
*/
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
int (*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
/* Read random bytes. */
bool (*strong_random) (WalProposer *wp, void *buf, size_t len);
bool (*strong_random) (void *buf, size_t len);
/*
* Get a basebackup LSN. Used to cross-validate with the latest available
* LSN on the safekeepers.
*/
XLogRecPtr (*get_redo_start_lsn) (WalProposer *wp);
XLogRecPtr (*get_redo_start_lsn) (void);
/*
* Finish sync safekeepers with the given LSN. This function should not
* return and should exit the program.
*/
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
void (*finish_sync_safekeepers) (XLogRecPtr lsn);
/*
* Called after every new message from the safekeeper. Used to propagate
@@ -547,22 +541,7 @@ typedef struct walproposer_api
* Called on peer_horizon_lsn updates. Used to advance replication slot
* and to free up disk space by deleting unnecessary WAL.
*/
void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
/*
* Write a log message to the internal log processor. This is used only
* when walproposer is compiled as a library. Otherwise, all logging is
* handled by elog().
*/
void (*log_internal) (WalProposer *wp, int level, const char *line);
/*
* Called right after the proposer was elected, but before it started
* recovery and sent ProposerElected message to the safekeepers.
*
* Used by logical replication to update truncateLsn.
*/
void (*after_election) (WalProposer *wp);
void (*confirm_wal_streamed) (XLogRecPtr lsn);
} walproposer_api;
/*
@@ -611,13 +590,6 @@ typedef struct WalProposerConfig
/* Will be passed to safekeepers in greet request. */
uint64 systemId;
/* Will be passed to safekeepers in greet request. */
TimeLineID pgTimeline;
#ifdef WALPROPOSER_LIB
void *callback_data;
#endif
} WalProposerConfig;
@@ -694,24 +666,7 @@ extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api
extern void WalProposerStart(WalProposer *wp);
extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);
/*
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
* recreate set from scratch, hence the export.
*/
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
#define WPEVENT 1337 /* special log level for walproposer internal
* events */
#ifdef WALPROPOSER_LIB
void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
#else
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
#endif
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
PageserverFeedback *rf);
#endif /* __NEON_WALPROPOSER_H__ */

View File

@@ -1,194 +0,0 @@
/*
* Contains copied/adapted functions from libpq and some internal postgres functions.
* This is needed to avoid linking to full postgres server installation. This file
* is compiled as a part of libwalproposer static library.
*/
#include <stdio.h>
#include "walproposer.h"
#include "utils/datetime.h"
#include "miscadmin.h"
void
ExceptionalCondition(const char *conditionName,
const char *fileName, int lineNumber)
{
fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
fileName, lineNumber, conditionName);
fprintf(stderr, "aborting...\n");
exit(1);
}
void
pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
{
if (datalen < 0 || datalen > (msg->len - msg->cursor))
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
memcpy(buf, &msg->data[msg->cursor], datalen);
msg->cursor += datalen;
}
/* --------------------------------
* pq_getmsgint - get a binary integer from a message buffer
*
* Values are treated as unsigned.
* --------------------------------
*/
unsigned int
pq_getmsgint(StringInfo msg, int b)
{
unsigned int result;
unsigned char n8;
uint16 n16;
uint32 n32;
switch (b)
{
case 1:
pq_copymsgbytes(msg, (char *) &n8, 1);
result = n8;
break;
case 2:
pq_copymsgbytes(msg, (char *) &n16, 2);
result = pg_ntoh16(n16);
break;
case 4:
pq_copymsgbytes(msg, (char *) &n32, 4);
result = pg_ntoh32(n32);
break;
default:
fprintf(stderr, "unsupported integer size %d\n", b);
ExceptionalCondition("unsupported integer size", __FILE__, __LINE__);
result = 0; /* keep compiler quiet */
break;
}
return result;
}
/* --------------------------------
* pq_getmsgint64 - get a binary 8-byte int from a message buffer
*
* It is tempting to merge this with pq_getmsgint, but we'd have to make the
* result int64 for all data widths --- that could be a big performance
* hit on machines where int64 isn't efficient.
* --------------------------------
*/
int64
pq_getmsgint64(StringInfo msg)
{
uint64 n64;
pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
return pg_ntoh64(n64);
}
/* --------------------------------
* pq_getmsgbyte - get a raw byte from a message buffer
* --------------------------------
*/
int
pq_getmsgbyte(StringInfo msg)
{
if (msg->cursor >= msg->len)
ExceptionalCondition("no data left in message", __FILE__, __LINE__);
return (unsigned char) msg->data[msg->cursor++];
}
/* --------------------------------
* pq_getmsgbytes - get raw data from a message buffer
*
* Returns a pointer directly into the message buffer; note this
* may not have any particular alignment.
* --------------------------------
*/
const char *
pq_getmsgbytes(StringInfo msg, int datalen)
{
const char *result;
if (datalen < 0 || datalen > (msg->len - msg->cursor))
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
result = &msg->data[msg->cursor];
msg->cursor += datalen;
return result;
}
/* --------------------------------
* pq_getmsgstring - get a null-terminated text string (with conversion)
*
* May return a pointer directly into the message buffer, or a pointer
* to a palloc'd conversion result.
* --------------------------------
*/
const char *
pq_getmsgstring(StringInfo msg)
{
char *str;
int slen;
str = &msg->data[msg->cursor];
/*
* It's safe to use strlen() here because a StringInfo is guaranteed to
* have a trailing null byte. But check we found a null inside the
* message.
*/
slen = strlen(str);
if (msg->cursor + slen >= msg->len)
ExceptionalCondition("invalid string in message", __FILE__, __LINE__);
msg->cursor += slen + 1;
return str;
}
/* --------------------------------
* pq_getmsgend - verify message fully consumed
* --------------------------------
*/
void
pq_getmsgend(StringInfo msg)
{
if (msg->cursor != msg->len)
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
}
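These helpers are straight ports of the libpq message readers: a cursor over a byte buffer, big-endian integer decoding, and a hard failure on underflow. A rough Rust equivalent of the same contract, as a sketch (std only; the error strings mirror the checks above):

// Cursor-based reader mirroring the pq_getmsg* helpers: big-endian integers,
// bounds-checked reads, cursor advanced on every access.
struct Msg<'a> {
    data: &'a [u8],
    cursor: usize,
}

impl<'a> Msg<'a> {
    fn copy(&mut self, n: usize) -> Result<&'a [u8], &'static str> {
        if n > self.data.len() - self.cursor {
            return Err("insufficient data left in message");
        }
        let s = &self.data[self.cursor..self.cursor + n];
        self.cursor += n;
        Ok(s)
    }
    fn get_u32(&mut self) -> Result<u32, &'static str> {
        Ok(u32::from_be_bytes(self.copy(4)?.try_into().unwrap()))
    }
    fn get_i64(&mut self) -> Result<i64, &'static str> {
        Ok(i64::from_be_bytes(self.copy(8)?.try_into().unwrap()))
    }
    fn get_end(&self) -> Result<(), &'static str> {
        if self.cursor != self.data.len() {
            return Err("invalid msg format");
        }
        Ok(())
    }
}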
/*
* Produce a C-string representation of a TimestampTz.
*
* This is mostly for use in emitting messages.
*/
const char *
timestamptz_to_str(TimestampTz t)
{
static char buf[MAXDATELEN + 1];
snprintf(buf, sizeof(buf), "TimestampTz(%ld)", t);
return buf;
}
bool
TimestampDifferenceExceeds(TimestampTz start_time,
TimestampTz stop_time,
int msec)
{
TimestampTz diff = stop_time - start_time;
return (diff >= msec * INT64CONST(1000));
}
void
WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
{
char buf[1024];
va_list args;
fmt = _(fmt);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
wp->api.log_internal(wp, elevel, buf);
}
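WalProposerLibLog is the library-build logging shim: it formats the message into a fixed buffer and hands the finished line to api.log_internal, which the embedding host supplies (elog() is unavailable outside postgres). A hedged Rust sketch of the same dispatch, with a boxed closure standing in for the callback:

// Format first, then hand the finished line to whatever sink the host registered.
struct LibLogger {
    sink: Box<dyn Fn(i32, &str)>, // plays the role of api.log_internal
}

impl LibLogger {
    fn log(&self, level: i32, args: std::fmt::Arguments<'_>) {
        // Equivalent of vsnprintf into the stack buffer, minus the 1024-byte cap.
        (self.sink)(level, &args.to_string());
    }
}

// Usage: logger.log(WPEVENT, format_args!("sk {}:{} connected", host, port));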


@@ -43,12 +43,9 @@
#include "utils/ps_status.h"
#include "utils/timestamp.h"
#include "libpq-fe.h"
#include "libpqwalproposer.h"
#include "neon.h"
#include "neon_walreader.h"
#include "walproposer.h"
#include "libpq-fe.h"
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */
@@ -76,8 +73,7 @@ static void walprop_register_bgworker(void);
static void walprop_pg_init_standalone_sync_safekeepers(void);
static void walprop_pg_init_walsender(void);
static void walprop_pg_init_bgworker(void);
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
static TimeLineID walprop_pg_get_timeline_id(void);
static TimestampTz walprop_pg_get_current_timestamp(void);
static void walprop_pg_load_libpqwalreceiver(void);
static process_interrupts_callback_t PrevProcessInterruptsCallback;
@@ -94,10 +90,6 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropClose(XLogRecPtr recptr);
static void add_nwr_event_set(Safekeeper *sk, uint32 events);
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
static void
init_walprop_config(bool syncSafekeepers)
{
@@ -112,7 +104,6 @@ init_walprop_config(bool syncSafekeepers)
walprop_config.systemId = GetSystemIdentifier();
else
walprop_config.systemId = 0;
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
}
/*
@@ -145,7 +136,7 @@ WalProposerMain(Datum main_arg)
walprop_pg_load_libpqwalreceiver();
wp = WalProposerCreate(&walprop_config, walprop_pg);
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp();
walprop_pg_init_walsender();
WalProposerStart(wp);
@@ -388,7 +379,7 @@ nwp_shmem_startup_hook(void)
}
static WalproposerShmemState *
walprop_pg_get_shmem_state(WalProposer *wp)
walprop_pg_get_shmem_state(void)
{
Assert(walprop_shared != NULL);
return walprop_shared;
@@ -514,7 +505,7 @@ walprop_pg_init_bgworker(void)
}
static XLogRecPtr
walprop_pg_get_flush_rec_ptr(WalProposer *wp)
walprop_pg_get_flush_rec_ptr(void)
{
#if PG_MAJORVERSION_NUM < 15
return GetFlushRecPtr();
@@ -524,7 +515,7 @@ walprop_pg_get_flush_rec_ptr(WalProposer *wp)
}
static TimestampTz
walprop_pg_get_current_timestamp(WalProposer *wp)
walprop_pg_get_current_timestamp(void)
{
return GetCurrentTimestamp();
}
@@ -548,6 +539,14 @@ walprop_pg_load_libpqwalreceiver(void)
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
}
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
struct WalProposerConn
{
PGconn *pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received data from walprop_async_read */
};
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -566,15 +565,15 @@ ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
/* Exported function definitions */
static char *
walprop_error_message(Safekeeper *sk)
walprop_error_message(WalProposerConn *conn)
{
return PQerrorMessage(sk->conn->pg_conn);
return PQerrorMessage(conn->pg_conn);
}
static WalProposerConnStatusType
walprop_status(Safekeeper *sk)
walprop_status(WalProposerConn *conn)
{
switch (PQstatus(sk->conn->pg_conn))
switch (PQstatus(conn->pg_conn))
{
case CONNECTION_OK:
return WP_CONNECTION_OK;
@@ -585,18 +584,16 @@ walprop_status(Safekeeper *sk)
}
}
WalProposerConn *
libpqwp_connect_start(char *conninfo)
static WalProposerConn *
walprop_connect_start(char *conninfo)
{
PGconn *pg_conn;
WalProposerConn *conn;
PGconn *pg_conn;
const char *keywords[3];
const char *values[3];
int n;
char *password = neon_auth_token;
/*
* Connect using the given connection string. If the NEON_AUTH_TOKEN
* environment variable was set, use that as the password.
@@ -622,11 +619,11 @@ libpqwp_connect_start(char *conninfo)
pg_conn = PQconnectStartParams(keywords, values, 1);
/*
* "If the result is null, then libpq has been unable to allocate a new
* PGconn structure"
* Allocation of a PGconn can fail, and will return NULL. We want to fully
* replicate the behavior of PQconnectStart here.
*/
if (!pg_conn)
elog(FATAL, "failed to allocate new PGconn object");
return NULL;
/*
* And in theory this allocation can fail as well, but it's incredibly
@@ -643,20 +640,12 @@ libpqwp_connect_start(char *conninfo)
return conn;
}
static void
walprop_connect_start(Safekeeper *sk)
{
Assert(sk->conn == NULL);
sk->conn = libpqwp_connect_start(sk->conninfo);
}
static WalProposerConnectPollStatusType
walprop_connect_poll(Safekeeper *sk)
walprop_connect_poll(WalProposerConn *conn)
{
WalProposerConnectPollStatusType return_val;
switch (PQconnectPoll(sk->conn->pg_conn))
switch (PQconnectPoll(conn->pg_conn))
{
case PGRES_POLLING_FAILED:
return_val = WP_CONN_POLLING_FAILED;
@@ -692,8 +681,8 @@ walprop_connect_poll(Safekeeper *sk)
return return_val;
}
extern bool
libpqwp_send_query(WalProposerConn *conn, char *query)
static bool
walprop_send_query(WalProposerConn *conn, char *query)
{
/*
* We need to be in blocking mode for sending the query to run without
@@ -709,16 +698,9 @@ libpqwp_send_query(WalProposerConn *conn, char *query)
return true;
}
static bool
walprop_send_query(Safekeeper *sk, char *query)
static WalProposerExecStatusType
walprop_get_query_result(WalProposerConn *conn)
{
return libpqwp_send_query(sk->conn, query);
}
WalProposerExecStatusType
libpqwp_get_query_result(WalProposerConn *conn)
{
PGresult *result;
WalProposerExecStatusType return_val;
@@ -794,29 +776,36 @@ libpqwp_get_query_result(WalProposerConn *conn)
return return_val;
}
static WalProposerExecStatusType
walprop_get_query_result(Safekeeper *sk)
{
return libpqwp_get_query_result(sk->conn);
}
static pgsocket
walprop_socket(Safekeeper *sk)
walprop_socket(WalProposerConn *conn)
{
return PQsocket(sk->conn->pg_conn);
return PQsocket(conn->pg_conn);
}
static int
walprop_flush(Safekeeper *sk)
walprop_flush(WalProposerConn *conn)
{
return (PQflush(sk->conn->pg_conn));
return (PQflush(conn->pg_conn));
}
/* Like libpqrcv_receive. *buf is valid until the next call. */
PGAsyncReadResult
libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
static void
walprop_finish(WalProposerConn *conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
PQfinish(conn->pg_conn);
pfree(conn);
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
{
int result;
if (conn->recvbuf != NULL)
@@ -885,25 +874,13 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
}
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
{
return libpqwp_async_read(sk->conn, buf, amount);
}
static PGAsyncWriteResult
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
/* If we aren't in non-blocking mode, switch to it. */
if (!ensure_nonblocking_status(sk->conn, true))
if (!ensure_nonblocking_status(conn, true))
return PG_ASYNC_WRITE_FAIL;
/*
@@ -911,7 +888,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occurred
*/
result = PQputCopyData(sk->conn->pg_conn, buf, size);
result = PQputCopyData(conn->pg_conn, buf, size);
/*
* We won't get a result of zero because walproposer always empties the
@@ -939,7 +916,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
* successful, 1 if it was unable to send all the data in the send queue
* yet, -1 if it failed for some reason
*/
switch (result = PQflush(sk->conn->pg_conn))
switch (result = PQflush(conn->pg_conn))
{
case 0:
return PG_ASYNC_WRITE_SUCCESS;
@@ -957,54 +934,28 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
* information, refer to the comments there.
*/
static bool
walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
{
int result;
/* If we are in non-blocking mode, switch out of it. */
if (!ensure_nonblocking_status(sk->conn, false))
if (!ensure_nonblocking_status(conn, false))
return false;
if ((result = PQputCopyData(sk->conn->pg_conn, buf, size)) == -1)
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
return false;
Assert(result == 1);
/* Because the connection is non-blocking, flushing returns 0 or -1 */
if ((result = PQflush(sk->conn->pg_conn)) == -1)
if ((result = PQflush(conn->pg_conn)) == -1)
return false;
Assert(result == 0);
return true;
}
void
libpqwp_disconnect(WalProposerConn *conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
PQfinish(conn->pg_conn);
pfree(conn);
}
static void
walprop_finish(Safekeeper *sk)
{
if (sk->conn)
{
libpqwp_disconnect(sk->conn);
sk->conn = NULL;
}
/* free xlogreader */
if (sk->xlogreader)
{
NeonWALReaderFree(sk->xlogreader);
sk->xlogreader = NULL;
}
}
/*
* Subscribe for new WAL and stream it in the loop to safekeepers.
*
@@ -1429,98 +1380,51 @@ XLogWalPropClose(XLogRecPtr recptr)
walpropFile = -1;
}
static NeonWALReadResult
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
static void
walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count)
{
NeonWALReadResult res;
WALReadError errinfo;
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
if (res == NEON_WALREAD_SUCCESS)
if (!WALRead(state,
buf,
startptr,
count,
walprop_pg_get_timeline_id(),
&errinfo))
{
/*
* If we have the socket subscribed but the walreader doesn't need any
* events, it must mean that the remote connection just closed, hoping to
* do the next read locally. Remove the socket then. This is important
* because otherwise the next read might open another connection and we
* won't be able to tell whether the correct socket is registered in the
* wait event set.
*/
if (NeonWALReaderEvents(sk->xlogreader) == 0)
rm_safekeeper_event_set(sk, false);
WALReadRaiseError(&errinfo);
}
return res;
}
static void
walprop_pg_wal_reader_allocate(Safekeeper *sk)
static XLogReaderState *
walprop_pg_wal_reader_allocate(void)
{
char log_prefix[64];
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
if (sk->xlogreader == NULL)
elog(FATAL, "Failed to allocate xlog reader");
return XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
}
static WaitEventSet *waitEvents;
static void
walprop_pg_free_event_set(WalProposer *wp)
walprop_pg_free_event_set(void)
{
if (waitEvents)
{
FreeWaitEventSet(waitEvents);
waitEvents = NULL;
}
for (int i = 0; i < wp->n_safekeepers; i++)
{
wp->safekeeper[i].eventPos = -1;
wp->safekeeper[i].nwrEventPos = -1;
}
}
static void
walprop_pg_init_event_set(WalProposer *wp)
walprop_pg_init_event_set(int n_safekeepers)
{
if (waitEvents)
elog(FATAL, "double-initialization of event set");
/* for each sk, we have a socket plus potentially a socket for the neon walreader */
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
for (int i = 0; i < wp->n_safekeepers; i++)
{
wp->safekeeper[i].eventPos = -1;
wp->safekeeper[i].nwrEventPos = -1;
}
}
/* add safekeeper socket to wait event set */
static void
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
{
Assert(sk->eventPos == -1);
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
}
/* add neon wal reader socket to wait event set */
static void
add_nwr_event_set(Safekeeper *sk, uint32 events)
{
Assert(sk->nwrEventPos == -1);
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
}
static void
@@ -1532,143 +1436,14 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
}
/*
* Update neon_walreader event.
* Can be called when the nwr socket doesn't exist; does nothing in that case.
*/
static void
update_nwr_event_set(Safekeeper *sk, uint32 events)
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
{
/* eventPos = -1 when we don't have an event */
if (sk->nwrEventPos != -1)
ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
}
static void
walprop_pg_active_state_update_event_set(Safekeeper *sk)
{
uint32 sk_events;
uint32 nwr_events;
Assert(sk->state == SS_ACTIVE);
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
/*
* If we need to wait for neon_walreader, ensure we have up to date socket
* in the wait event set.
*/
if (sk->active_state == SS_ACTIVE_READ_WAL)
{
/*
* TODO: instead of reattaching socket (and thus recreating WES) each
* time we should keep it if possible, i.e. if connection is already
* established. Note that single neon_walreader object can switch
* between local and remote reads multiple times during its lifetime,
* so careful bookkeeping is needed here.
*/
rm_safekeeper_event_set(sk, false);
add_nwr_event_set(sk, nwr_events);
}
else
{
/*
* Hack: we should always set 0 here, but for random reasons
* WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
* some event. Since there is also no way to remove socket except
* reconstructing the whole set, SafekeeperStateDesiredEvents instead
* gives WL_SOCKET_CLOSED if socket exists.
*/
Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
update_nwr_event_set(sk, WL_SOCKET_CLOSED);
}
walprop_pg_update_event_set(sk, sk_events);
}
static void
walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
{
rm_safekeeper_event_set(to_remove, true);
}
/*
* A hacky way to remove a single event from the event set. Can be called if
* the event doesn't exist; does nothing in that case.
*
* Note: Internally, this completely reconstructs the event set. It should be
* avoided if possible.
*
* If is_sk is true, the socket of the connection to the safekeeper is removed;
* otherwise the socket of the neon_walreader.
*/
static void
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
{
WalProposer *wp = to_remove->wp;
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
to_remove->host, to_remove->port, is_sk);
/*
* Short path for exiting if we have nothing to do. We never call this
* function when the safekeeper socket doesn't exist, but we do when the
* neon walreader socket doesn't.
*/
if ((is_sk && to_remove->eventPos == -1) ||
(!is_sk && to_remove->nwrEventPos == -1))
{
return;
}
/* Remove the existing event set, assign sk->eventPos = -1 */
walprop_pg_free_event_set(wp);
/* Re-initialize it without adding any safekeeper events */
wp->api.init_event_set(wp);
/*
* loop through the existing safekeepers. If they aren't the one we're
* removing, and if they have a socket we can use, re-add the applicable
* events.
*/
for (int i = 0; i < wp->n_safekeepers; i++)
{
Safekeeper *sk = &wp->safekeeper[i];
if (sk == to_remove)
{
if (is_sk)
sk->eventPos = -1;
else
sk->nwrEventPos = -1;
}
/*
* If this safekeeper isn't offline, add events for it, except for the
* event we were asked to remove.
*/
if (sk->state != SS_OFFLINE)
{
uint32 sk_events;
uint32 nwr_events;
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
if (sk != to_remove || !is_sk)
{
/* will set sk->eventPos */
wp->api.add_safekeeper_event_set(sk, sk_events);
}
else if ((sk != to_remove || is_sk) && nwr_events)
{
add_nwr_event_set(sk, nwr_events);
}
}
}
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk->conn), NULL, sk);
}
static int
walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events)
walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
{
WaitEvent event = {0};
int rc = 0;
@@ -1724,7 +1499,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
}
static void
walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
walprop_pg_finish_sync_safekeepers(XLogRecPtr lsn)
{
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
exit(0);
@@ -1836,7 +1611,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
* pageserver.
*/
quorumFeedback.rf.disk_consistent_lsn,
walprop_pg_get_current_timestamp(wp), false);
walprop_pg_get_current_timestamp(), false);
}
CombineHotStanbyFeedbacks(&hsFeedback, wp);
@@ -1853,69 +1628,18 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
}
static void
walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
walprop_pg_confirm_wal_streamed(XLogRecPtr lsn)
{
if (MyReplicationSlot)
PhysicalConfirmReceivedLocation(lsn);
}
static XLogRecPtr
walprop_pg_get_redo_start_lsn(WalProposer *wp)
{
return GetRedoStartLsn();
}
static bool
walprop_pg_strong_random(WalProposer *wp, void *buf, size_t len)
{
return pg_strong_random(buf, len);
}
static void
walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
{
elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
}
static void
walprop_pg_after_election(WalProposer *wp)
{
FILE *f;
XLogRecPtr lrRestartLsn;
/* We don't need to do anything in syncSafekeepers mode. */
if (wp->config->syncSafekeepers)
return;
/*
* If there are active logical replication subscriptions, we need to provide
* enough WAL for their WAL senders based on the position of their
* replication slots.
*/
f = fopen("restart.lsn", "rb");
if (f != NULL && !wp->config->syncSafekeepers)
{
fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
fclose(f);
if (lrRestartLsn != InvalidXLogRecPtr)
{
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
/*
* start from the beginning of the segment to fetch page headers
* verified by XLogReader
*/
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
}
}
}
static const walproposer_api walprop_pg = {
.get_shmem_state = walprop_pg_get_shmem_state,
.start_streaming = walprop_pg_start_streaming,
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
.get_current_timestamp = walprop_pg_get_current_timestamp,
.get_timeline_id = walprop_pg_get_timeline_id,
.conn_error_message = walprop_error_message,
.conn_status = walprop_status,
.conn_connect_start = walprop_connect_start,
@@ -1930,17 +1654,14 @@ static const walproposer_api walprop_pg = {
.recovery_download = WalProposerRecovery,
.wal_read = walprop_pg_wal_read,
.wal_reader_allocate = walprop_pg_wal_reader_allocate,
.free_event_set = walprop_pg_free_event_set,
.init_event_set = walprop_pg_init_event_set,
.update_event_set = walprop_pg_update_event_set,
.active_state_update_event_set = walprop_pg_active_state_update_event_set,
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
.wait_event_set = walprop_pg_wait_event_set,
.strong_random = walprop_pg_strong_random,
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
.strong_random = pg_strong_random,
.get_redo_start_lsn = GetRedoStartLsn,
.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
.log_internal = walprop_pg_log_internal,
.after_election = walprop_pg_after_election,
};
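The comments in walprop_async_write above encode libpq's contract: PQputCopyData returns 1 (queued), 0 (buffers full), or -1 (error), and PQflush returns 0 (all flushed), 1 (data still queued), or -1 (error). A small sketch of how those codes map onto the write outcomes; TryFlush is an assumed name for the "call flush again later" variant:

// Mapping libpq return codes onto the walproposer async-write outcome.
enum PgAsyncWriteResult {
    Success,  // everything flushed: PG_ASYNC_WRITE_SUCCESS
    TryFlush, // data still queued, caller must flush again (assumed name)
    Fail,     // PG_ASYNC_WRITE_FAIL
}

fn classify_write(put_copy_data: i32, flush: i32) -> PgAsyncWriteResult {
    match put_copy_data {
        -1 => return PgAsyncWriteResult::Fail, // error while queuing the data
        1 => {}                                // queued; now try to flush
        _ => unreachable!("0 not expected: walproposer always empties the buffer"),
    }
    match flush {
        0 => PgAsyncWriteResult::Success,
        1 => PgAsyncWriteResult::TryFlush,
        _ => PgAsyncWriteResult::Fail,
    }
}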

poetry.lock (generated)

@@ -2415,13 +2415,13 @@ files = [
[[package]]
name = "urllib3"
version = "1.26.18"
version = "1.26.17"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
{file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"},
{file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"},
{file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"},
{file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"},
]
[package.extras]
@@ -2488,16 +2488,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},


@@ -83,10 +83,6 @@ struct ProxyCliArgs {
/// timeout for http connections
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
require_client_ip: bool,
}
#[tokio::main]
@@ -237,7 +233,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
metric_collection,
allow_self_signed_compute: args.allow_self_signed_compute,
http_config,
require_client_ip: args.require_client_ip,
}));
Ok(config)


@@ -14,7 +14,6 @@ pub struct ProxyConfig {
pub metric_collection: Option<MetricCollectionConfig>,
pub allow_self_signed_compute: bool,
pub http_config: HttpConfig,
pub require_client_ip: bool,
}
#[derive(Debug)]


@@ -8,17 +8,14 @@ use pbkdf2::{
Params, Pbkdf2,
};
use pq_proto::StartupMessageParams;
use std::sync::atomic::{self, AtomicUsize};
use std::{collections::HashMap, sync::Arc};
use std::{
fmt,
task::{ready, Poll},
};
use std::{
ops::Deref,
sync::atomic::{self, AtomicUsize},
};
use tokio::time;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use tokio_postgres::AsyncMessage;
use crate::{
auth, console,
@@ -29,13 +26,13 @@ use crate::{compute, config};
use crate::proxy::ConnectMechanism;
use tracing::{error, warn, Span};
use tracing::{error, warn};
use tracing::{info, info_span, Instrument};
pub const APP_NAME: &str = "sql_over_http";
const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct ConnInfo {
pub username: String,
pub dbname: String,
@@ -58,7 +55,7 @@ impl fmt::Display for ConnInfo {
}
struct ConnPoolEntry {
conn: ClientInner,
conn: Client,
_last_access: std::time::Instant,
}
@@ -136,20 +133,14 @@ impl GlobalConnPool {
}
pub async fn get(
self: &Arc<Self>,
&self,
conn_info: &ConnInfo,
force_new: bool,
session_id: uuid::Uuid,
) -> anyhow::Result<Client> {
let mut client: Option<ClientInner> = None;
let mut client: Option<Client> = None;
let mut latency_timer = LatencyTimer::new("http");
let pool = if force_new {
None
} else {
Some((conn_info.clone(), self.clone()))
};
let mut hash_valid = false;
if !force_new {
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
@@ -197,11 +188,7 @@ impl GlobalConnPool {
latency_timer.pool_hit();
info!("pool: reusing connection '{conn_info}'");
client.session.send(session_id)?;
return Ok(Client {
inner: Some(client),
span: Span::current(),
pool,
});
return Ok(client);
}
} else {
info!("pool: opening a new connection '{conn_info}'");
@@ -241,14 +228,10 @@ impl GlobalConnPool {
_ => {}
}
new_client.map(|inner| Client {
inner: Some(inner),
span: Span::current(),
pool,
})
new_client
}
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
pub fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
// We want to hold this open while we return. This ensures that the pool can't close
// while we are in the middle of returning the connection.
let closed = self.closed.read();
@@ -343,7 +326,7 @@ struct TokioMechanism<'a> {
#[async_trait]
impl ConnectMechanism for TokioMechanism<'_> {
type Connection = ClientInner;
type Connection = Client;
type ConnectError = tokio_postgres::Error;
type Error = anyhow::Error;
@@ -367,7 +350,7 @@ async fn connect_to_compute(
conn_info: &ConnInfo,
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
) -> anyhow::Result<ClientInner> {
) -> anyhow::Result<Client> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -416,7 +399,7 @@ async fn connect_to_compute_once(
conn_info: &ConnInfo,
timeout: time::Duration,
mut session: uuid::Uuid,
) -> Result<ClientInner, tokio_postgres::Error> {
) -> Result<Client, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
let (client, mut connection) = config
@@ -479,99 +462,21 @@ async fn connect_to_compute_once(
.instrument(span)
);
Ok(ClientInner {
Ok(Client {
inner: client,
session: tx,
ids,
})
}
struct ClientInner {
inner: tokio_postgres::Client,
pub struct Client {
pub inner: tokio_postgres::Client,
session: tokio::sync::watch::Sender<uuid::Uuid>,
ids: Ids,
}
impl Client {
pub fn metrics(&self) -> Arc<MetricCounter> {
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
}
}
pub struct Client {
span: Span,
inner: Option<ClientInner>,
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
pub struct Discard<'a> {
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
}
impl Client {
pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
let Self {
inner,
pool,
span: _,
} = self;
(
&mut inner
.as_mut()
.expect("client inner should not be removed")
.inner,
Discard { pool },
)
}
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
self.inner().1.check_idle(status)
}
pub fn discard(&mut self) {
self.inner().1.discard()
}
}
impl Discard<'_> {
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
if status != ReadyForQueryStatus::Idle {
if let Some((conn_info, _)) = self.pool.take() {
info!("pool: throwing away connection '{conn_info}' because connection is not idle")
}
}
}
pub fn discard(&mut self) {
if let Some((conn_info, _)) = self.pool.take() {
info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
}
}
}
impl Deref for Client {
type Target = tokio_postgres::Client;
fn deref(&self) -> &Self::Target {
&self
.inner
.as_ref()
.expect("client inner should not be removed")
.inner
}
}
impl Drop for Client {
fn drop(&mut self) {
let client = self
.inner
.take()
.expect("client inner should not be removed");
if let Some((conn_info, conn_pool)) = self.pool.take() {
let current_span = self.span.clone();
// return connection to the pool
tokio::task::spawn_blocking(move || {
let _span = current_span.enter();
let _ = conn_pool.put(&conn_info, client);
});
}
USAGE_METRICS.register(self.ids.clone())
}
}
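The pooled flavor of Client above is a guard object: it owns the live connection as an Option so that Drop can take it back out and either return it to the pool or let it close. A stripped-down sketch of that pattern (Conn and Pool are placeholders for ClientInner and GlobalConnPool):

use std::sync::{Arc, Mutex};

struct Conn; // stands in for ClientInner
struct Pool {
    idle: Mutex<Vec<Conn>>,
}

struct PooledClient {
    inner: Option<Conn>,
    pool: Option<Arc<Pool>>, // None when pooling is disabled (force_new)
}

impl PooledClient {
    fn discard(&mut self) {
        // Drop the pool handle: Drop will then just close the connection
        // instead of returning a possibly-broken one to the pool.
        self.pool = None;
    }
}

impl Drop for PooledClient {
    fn drop(&mut self) {
        let conn = self.inner.take().expect("client inner should not be removed");
        if let Some(pool) = self.pool.take() {
            pool.idle.lock().unwrap().push(conn); // return to the pool
        }
        // otherwise conn is dropped here and the connection closes
    }
}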


@@ -17,9 +17,7 @@ use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::ReadyForQueryStatus;
use tokio_postgres::Row;
use tokio_postgres::Transaction;
use tracing::error;
use tracing::instrument;
use url::Url;
@@ -66,18 +64,20 @@ static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
// Convert json non-string types to strings, so that they can be passed to Postgres
// as parameters.
//
fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
json.iter()
.map(|value| {
match value {
// special care for nulls
Value::Null => None,
Value::Null => Ok(None),
// convert to text with escaping
v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
Value::Bool(_) => serde_json::to_string(value).map(Some),
Value::Number(_) => serde_json::to_string(value).map(Some),
Value::Object(_) => serde_json::to_string(value).map(Some),
// avoid escaping here, as we pass this as a parameter
Value::String(s) => Some(s.to_string()),
Value::String(s) => Ok(Some(s.to_string())),
// special care for arrays
Value::Array(_) => json_array_to_pg_array(value),
@@ -94,26 +94,29 @@ fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
//
// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
//
fn json_array_to_pg_array(value: &Value) -> Option<String> {
fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
match value {
// special care for nulls
Value::Null => None,
Value::Null => Ok(None),
// convert to text with escaping
Value::Bool(_) => serde_json::to_string(value).map(Some),
Value::Number(_) => serde_json::to_string(value).map(Some),
// here string needs to be escaped, as it is part of the array
v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()),
v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())),
Value::Object(_) => json_array_to_pg_array(&Value::String(serde_json::to_string(value)?)),
Value::String(_) => serde_json::to_string(value).map(Some),
// recurse into array
Value::Array(arr) => {
let vals = arr
.iter()
.map(json_array_to_pg_array)
.map(|v| v.unwrap_or_else(|| "NULL".to_string()))
.collect::<Vec<_>>()
.map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
.collect::<Result<Vec<_>, _>>()?
.join(",");
Some(format!("{{{}}}", vals))
Ok(Some(format!("{{{}}}", vals)))
}
}
}
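For reference, the encoding above turns a JSON array into a Postgres array literal: bools and numbers pass through, strings are JSON-escaped because they sit inside the braces, and nulls become the NULL keyword. A usage sketch, assuming it sits next to the Result-returning json_to_pg_text in this module (compare the tests at the bottom of the file):

use serde_json::json;

fn demo() {
    let params = json_to_pg_text(vec![json!([true, null, "foo"])]).unwrap();
    assert_eq!(params, vec![Some(r#"{true,NULL,"foo"}"#.to_string())]);
}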
@@ -312,119 +315,83 @@ async fn handle_inner(
// Now execute the query and return the result
//
let mut size = 0;
let result =
match payload {
Payload::Single(stmt) => {
let (status, results) =
query_to_json(&*client, stmt, &mut 0, raw_output, array_mode)
.await
.map_err(|e| {
client.discard();
e
})?;
client.check_idle(status);
results
let result = match payload {
Payload::Single(query) => {
query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
}
Payload::Batch(batch_query) => {
let mut results = Vec::new();
let mut builder = client.inner.build_transaction();
if let Some(isolation_level) = txn_isolation_level {
builder = builder.isolation_level(isolation_level);
}
Payload::Batch(statements) => {
let (inner, mut discard) = client.inner();
let mut builder = inner.build_transaction();
if let Some(isolation_level) = txn_isolation_level {
builder = builder.isolation_level(isolation_level);
}
if txn_read_only {
builder = builder.read_only(true);
}
if txn_deferrable {
builder = builder.deferrable(true);
}
let transaction = builder.start().await.map_err(|e| {
// if we cannot start a transaction, we should return immediately
// and not return to the pool. connection is clearly broken
discard.discard();
e
})?;
let results =
match query_batch(&transaction, statements, &mut size, raw_output, array_mode)
.await
{
Ok(results) => {
let status = transaction.commit().await.map_err(|e| {
// if we cannot commit - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
e
})?;
discard.check_idle(status);
results
}
Err(err) => {
let status = transaction.rollback().await.map_err(|e| {
// if we cannot rollback - for now don't return connection to pool
// TODO: get a query status from the error
discard.discard();
e
})?;
discard.check_idle(status);
return Err(err);
}
};
if txn_read_only {
response = response.header(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
}
if txn_deferrable {
response = response.header(
TXN_DEFERRABLE.clone(),
HeaderValue::try_from(txn_deferrable.to_string())?,
);
}
if let Some(txn_isolation_level) = txn_isolation_level_raw {
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
}
json!({ "results": results })
if txn_read_only {
builder = builder.read_only(true);
}
};
if txn_deferrable {
builder = builder.deferrable(true);
}
let transaction = builder.start().await?;
for query in batch_query.queries {
let result =
query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
transaction.rollback().await?;
return Err(e);
}
}
}
transaction.commit().await?;
if txn_read_only {
response = response.header(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
}
if txn_deferrable {
response = response.header(
TXN_DEFERRABLE.clone(),
HeaderValue::try_from(txn_deferrable.to_string())?,
);
}
if let Some(txn_isolation_level) = txn_isolation_level_raw {
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
}
Ok(json!({ "results": results }))
}
};
let metrics = client.metrics();
// how could this possibly fail
let body = serde_json::to_string(&result).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
// count the egress bytes - we miss the TLS and header overhead but oh well...
// moving this later in the stack is going to be a lot of effort and ehhhh
metrics.record_egress(len as u64);
Ok(response)
}
async fn query_batch(
transaction: &Transaction<'_>,
queries: BatchQueryData,
total_size: &mut usize,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<Vec<Value>> {
let mut results = Vec::with_capacity(queries.queries.len());
let mut current_size = 0;
for stmt in queries.queries {
// TODO: maybe we should check that the transaction bit is set here
let (_, values) =
query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
results.push(values);
if allow_pool {
let current_span = tracing::Span::current();
// return connection to the pool
tokio::task::spawn_blocking(move || {
let _span = current_span.enter();
let _ = conn_pool.put(&conn_info, client);
});
}
match result {
Ok(value) => {
// how could this possibly fail
let body = serde_json::to_string(&value).expect("json serialization should not fail");
let len = body.len();
let response = response
.body(Body::from(body))
// only fails if invalid status code or invalid header/values are given.
// these are not user configurable so it cannot fail dynamically
.expect("building response payload should not fail");
// count the egress bytes - we miss the TLS and header overhead but oh well...
// moving this later in the stack is going to be a lot of effort and ehhhh
metrics.record_egress(len as u64);
Ok(response)
}
Err(e) => Err(e),
}
*total_size += current_size;
Ok(results)
}
async fn query_to_json<T: GenericClient>(
@@ -433,9 +400,11 @@ async fn query_to_json<T: GenericClient>(
current_size: &mut usize,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
let query_params = json_to_pg_text(data.params);
let row_stream = client.query_raw_txt(&data.query, query_params).await?;
) -> anyhow::Result<Value> {
let query_params = json_to_pg_text(data.params)?;
let row_stream = client
.query_raw_txt::<String, _>(data.query, query_params)
.await?;
// Manually drain the stream into a vector to leave row_stream hanging
// around to get a command tag. Also check that the response is not too
@@ -455,8 +424,6 @@ async fn query_to_json<T: GenericClient>(
}
}
let ready = row_stream.ready_status();
// grab the command tag and number of rows affected
let command_tag = row_stream.command_tag().unwrap_or_default();
let mut command_tag_split = command_tag.split(' ');
@@ -497,16 +464,13 @@ async fn query_to_json<T: GenericClient>(
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
Ok((
ready,
json!({
"command": command_tag_name,
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}),
))
Ok(json!({
"command": command_tag_name,
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}))
}
//
@@ -691,22 +655,22 @@ mod tests {
#[test]
fn test_atomic_types_to_pg_params() {
let json = vec![Value::Bool(true), Value::Bool(false)];
let pg_params = json_to_pg_text(json);
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(
pg_params,
vec![Some("true".to_owned()), Some("false".to_owned())]
);
let json = vec![Value::Number(serde_json::Number::from(42))];
let pg_params = json_to_pg_text(json);
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec![Some("42".to_owned())]);
let json = vec![Value::String("foo\"".to_string())];
let pg_params = json_to_pg_text(json);
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
let json = vec![Value::Null];
let pg_params = json_to_pg_text(json);
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec![None]);
}
@@ -715,7 +679,7 @@ mod tests {
// atoms and escaping
let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
let json: Value = serde_json::from_str(json).unwrap();
let pg_params = json_to_pg_text(vec![json]);
let pg_params = json_to_pg_text(vec![json]).unwrap();
assert_eq!(
pg_params,
vec![Some(
@@ -726,7 +690,7 @@ mod tests {
// nested arrays
let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
let json: Value = serde_json::from_str(json).unwrap();
let pg_params = json_to_pg_text(vec![json]);
let pg_params = json_to_pg_text(vec![json]).unwrap();
assert_eq!(
pg_params,
vec![Some(
@@ -736,7 +700,7 @@ mod tests {
// array of objects
let json = r#"[{"foo": 1},{"bar": 2}]"#;
let json: Value = serde_json::from_str(json).unwrap();
let pg_params = json_to_pg_text(vec![json]);
let pg_params = json_to_pg_text(vec![json]).unwrap();
assert_eq!(
pg_params,
vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]


@@ -8,7 +8,6 @@ use crate::{
NUM_CLIENT_CONNECTION_OPENED_COUNTER,
},
};
use anyhow::bail;
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hyper::{
@@ -23,6 +22,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use std::{
convert::Infallible,
future::ready,
pin::Pin,
sync::Arc,
@@ -280,18 +280,12 @@ pub async fn task_main(
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
let (io, tls) = stream.get_ref();
let client_addr = io.client_addr();
let remote_addr = io.inner.remote_addr();
let peer_addr = io.client_addr().unwrap_or(io.inner.remote_addr());
let sni_name = tls.server_name().map(|s| s.to_string());
let conn_pool = conn_pool.clone();
async move {
let peer_addr = match client_addr {
Some(addr) => addr,
None if config.require_client_ip => bail!("missing required client ip"),
None => remote_addr,
};
Ok(MetricService::new(hyper::service::service_fn(
Ok::<_, Infallible>(MetricService::new(hyper::service::service_fn(
move |req: Request<Body>| {
let sni_name = sni_name.clone();
let conn_pool = conn_pool.clone();


@@ -200,8 +200,6 @@ pub async fn task_main(
let mut socket = WithClientIp::new(socket);
if let Some(ip) = socket.wait_for_addr().await? {
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
} else if config.require_client_ip {
bail!("missing required client IP");
}
socket


@@ -31,7 +31,7 @@ impl<'a> FirstMessage<'a> {
/// A single SASL message.
/// This struct is deliberately decoupled from lower-level
/// [`BeAuthenticationSaslMessage`].
/// [`BeAuthenticationSaslMessage`](pq_proto::BeAuthenticationSaslMessage).
#[derive(Debug)]
pub(super) enum ServerMessage<T> {
/// We expect to see more steps.


@@ -1,5 +1,5 @@
[toolchain]
channel = "1.73.0"
channel = "1.72.1"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html


@@ -3,7 +3,7 @@
//
use anyhow::{bail, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use clap::{ArgAction, Parser};
use clap::Parser;
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
@@ -105,9 +105,6 @@ struct Args {
/// it during this period passed as a human readable duration.
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
heartbeat_timeout: Duration,
/// Enable/disable peer recovery.
#[arg(long, default_value = "false", action=ArgAction::Set)]
peer_recovery: bool,
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
/// inline table, e.g.
/// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
@@ -268,7 +265,6 @@ async fn main() -> anyhow::Result<()> {
broker_endpoint: args.broker_endpoint,
broker_keepalive_interval: args.broker_keepalive_interval,
heartbeat_timeout: args.heartbeat_timeout,
peer_recovery_enabled: args.peer_recovery,
remote_storage: args.remote_storage,
max_offloader_lag_bytes: args.max_offloader_lag,
wal_backup_enabled: !args.disable_wal_backup,


@@ -372,13 +372,6 @@ impl SafekeeperPostgresHandler {
/// from a walproposer recovery function. This connection gets special handling:
/// the safekeeper must stream all local WAL up to the flush_lsn, whether committed or not.
pub fn is_walproposer_recovery(&self) -> bool {
match &self.appname {
None => false,
Some(appname) => {
appname == "wal_proposer_recovery" ||
// set by safekeeper peer recovery
appname.starts_with("safekeeper")
}
}
self.appname == Some("wal_proposer_recovery".to_string())
}
}


@@ -16,8 +16,8 @@ use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
use crate::safekeeper::{ServerInfo, TermLsn};
use crate::send_wal::WalSenderState;
use crate::timeline::PeerInfo;
use crate::{debug_dump, pull_timeline};
@@ -60,25 +60,16 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
.as_ref()
}
/// Same as TermLsn, but serializes LSN using display serializer
/// Same as TermSwitchEntry, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[serde_as]
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
pub term: Term,
#[serde_as(as = "DisplayFromStr")]
pub lsn: Lsn,
}
impl From<TermSwitchApiEntry> for TermLsn {
fn from(api_val: TermSwitchApiEntry) -> Self {
TermLsn {
term: api_val.term,
lsn: api_val.lsn,
}
}
}
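The serde_as/DisplayFromStr pair above is what makes the lsn field render as Postgres-style 0/FFFFFFFF text rather than a bare integer. A minimal self-contained sketch of the mechanic; MyLsn is a stand-in for utils::lsn::Lsn, which supplies the Display/FromStr impls in the real code:

use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{fmt, str::FromStr};

struct MyLsn(u64);

impl fmt::Display for MyLsn {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffff_ffff)
    }
}

impl FromStr for MyLsn {
    type Err = &'static str;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (hi, lo) = s.split_once('/').ok_or("expected HI/LO")?;
        let hi = u64::from_str_radix(hi, 16).map_err(|_| "bad hex")?;
        let lo = u64::from_str_radix(lo, 16).map_err(|_| "bad hex")?;
        Ok(MyLsn((hi << 32) | lo))
    }
}

#[serde_as]
#[derive(Serialize, Deserialize)]
struct Entry {
    term: u64,
    #[serde_as(as = "DisplayFromStr")]
    lsn: MyLsn, // serialized as, e.g., "16/B374D848" instead of a u64
}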
/// Augment AcceptorState with epoch for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {


@@ -62,7 +62,6 @@ pub struct SafeKeeperConf {
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub heartbeat_timeout: Duration,
pub peer_recovery_enabled: bool,
pub remote_storage: Option<RemoteStorageConfig>,
pub max_offloader_lag_bytes: u64,
pub backup_parallel_jobs: usize,
@@ -101,7 +100,6 @@ impl SafeKeeperConf {
.parse()
.expect("failed to parse default broker endpoint"),
broker_keepalive_interval: Duration::from_secs(5),
peer_recovery_enabled: true,
wal_backup_enabled: true,
backup_parallel_jobs: 1,
pg_auth: None,


@@ -55,12 +55,9 @@ impl WalReceivers {
/// Register new walreceiver. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
let slots = &mut self.mutex.lock().slots;
let walreceiver = WalReceiverState {
conn_id,
status: WalReceiverStatus::Voting,
};
let walreceiver = WalReceiverState::Voting;
// find empty slot or create new one
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
slots[pos] = Some(walreceiver);
@@ -99,18 +96,6 @@ impl WalReceivers {
self.mutex.lock().slots.iter().flatten().cloned().collect()
}
/// Get number of streaming walreceivers (normally 0 or 1) from compute.
pub fn get_num_streaming(self: &Arc<WalReceivers>) -> usize {
self.mutex
.lock()
.slots
.iter()
.flatten()
// conn_id.is_none skips recovery which also registers here
.filter(|s| s.conn_id.is_some() && matches!(s.status, WalReceiverStatus::Streaming))
.count()
}
/// Unregister walsender.
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
let mut shared = self.mutex.lock();
@@ -123,17 +108,10 @@ struct WalReceiversShared {
slots: Vec<Option<WalReceiverState>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
pub conn_id: Option<ConnectionId>,
pub status: WalReceiverStatus,
}
/// Walreceiver status. Currently only whether it passed voting stage and
/// started receiving the stream, but it is easy to add more if needed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalReceiverStatus {
pub enum WalReceiverState {
Voting,
Streaming,
}
@@ -158,8 +136,8 @@ impl Drop for WalReceiverGuard {
}
}
pub const MSG_QUEUE_SIZE: usize = 256;
pub const REPLY_QUEUE_SIZE: usize = 16;
const MSG_QUEUE_SIZE: usize = 256;
const REPLY_QUEUE_SIZE: usize = 16;
impl SafekeeperPostgresHandler {
/// Wrapper around handle_start_wal_push_guts handling result. Error is
@@ -283,7 +261,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
tli.clone(),
msg_rx,
reply_tx,
Some(self.conn_id),
self.conn_id,
));
// Forward all messages to WalAcceptor
@@ -339,41 +317,31 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
// even when it writes a steady stream of messages.
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
/// replies to reply_tx; reading from socket and writing to disk in parallel is
/// beneficial for performance; this struct provides the writing-to-disk part.
pub struct WalAcceptor {
/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
struct WalAcceptor {
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
}
impl WalAcceptor {
/// Spawn a task with WalAcceptor running and return a handle to it. The task
/// returns Ok(()) if either of the channels has closed, and Err if any error
/// is encountered during message processing.
///
/// A conn_id of None means the WalAcceptor serves recovery initiated at this safekeeper.
pub fn spawn(
/// Spawn a task with WalAcceptor running; return a handle to it.
fn spawn(
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
conn_id: ConnectionId,
) -> JoinHandle<anyhow::Result<()>> {
task::spawn(async move {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
conn_id,
};
let span_ttid = wa.tli.ttid; // satisfy borrow checker
wa.run()
.instrument(
info_span!("WAL acceptor", cid = %conn_id.unwrap_or(0), ttid = %span_ttid),
)
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
.await
})
}
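As the doc comments above explain, the channel pair exists so that reading from the socket and writing WAL to disk overlap: a network reader keeps msg_rx full while the acceptor drains it and pushes replies. A rough tokio sketch of that shape (Msg and Reply stand in for the proposer/acceptor message enums):

use tokio::sync::mpsc::{Receiver, Sender};
use tokio::task::JoinHandle;

struct Msg;   // stands in for ProposerAcceptorMessage
struct Reply; // stands in for AcceptorProposerMessage

fn spawn_acceptor(mut msg_rx: Receiver<Msg>, reply_tx: Sender<Reply>) -> JoinHandle<()> {
    tokio::spawn(async move {
        // Drain messages (the WAL-writing side) while the network reader
        // keeps refilling msg_rx in parallel.
        while let Some(_msg) = msg_rx.recv().await {
            if reply_tx.send(Reply).await.is_err() {
                break; // reply channel closed; peer is gone
            }
        }
    })
}

// Usage: let (msg_tx, msg_rx) = tokio::sync::mpsc::channel::<Msg>(MSG_QUEUE_SIZE);
//        let (reply_tx, reply_rx) = tokio::sync::mpsc::channel::<Reply>(REPLY_QUEUE_SIZE);
//        let handle = spawn_acceptor(msg_rx, reply_tx);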
@@ -387,7 +355,7 @@ impl WalAcceptor {
let _compute_conn_guard = ComputeConnectionGuard {
timeline: Arc::clone(&self.tli),
};
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
let walreceiver_guard = self.tli.get_walreceivers().register();
self.tli.update_status_notify().await?;
// After this timestamp we will stop processing AppendRequests and send a response
@@ -404,7 +372,7 @@ impl WalAcceptor {
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
*walreceiver_guard.get() = WalReceiverState::Streaming;
}
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {


@@ -1,41 +1,17 @@
//! This module implements pulling WAL from peer safekeepers if compute can't
//! provide it, i.e. the safekeeper lags too far behind.
use std::time::SystemTime;
use std::{fmt, pin::pin, sync::Arc};
use std::sync::Arc;
use anyhow::{bail, Context};
use futures::StreamExt;
use postgres_protocol::message::backend::ReplicationMessage;
use tokio::sync::mpsc::{channel, Receiver, Sender};
use tokio::time::timeout;
use tokio::{
select,
time::sleep,
time::{self, Duration},
};
use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::types::PgLsn;
use tracing::*;
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
use tokio::{select, time::sleep, time::Duration};
use tracing::{info, instrument};
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
use crate::{
http::routes::TimelineStatus,
receive_wal::MSG_QUEUE_SIZE,
safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
TermLsn, VoteRequest,
},
timeline::{PeerInfo, Timeline},
SafeKeeperConf,
};
use crate::{timeline::Timeline, SafeKeeperConf};
/// Entrypoint for the per-timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
info!("started");
let mut cancellation_rx = match tli.get_cancellation_rx() {
Ok(rx) => rx,
@@ -46,387 +22,19 @@ pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
};
select! {
_ = recovery_main_loop(tli, conf) => { unreachable!() }
_ = recovery_main_loop(tli) => { unreachable!() }
_ = cancellation_rx.changed() => {
info!("stopped");
}
}
}
/// Result of Timeline::recovery_needed; contains donor(s) if recovery is
/// needed, plus fields explaining the choice.
#[derive(Debug)]
pub struct RecoveryNeededInfo {
/// my term
pub term: Term,
/// my last_log_term
pub last_log_term: Term,
/// my flush_lsn
pub flush_lsn: Lsn,
/// peers from which we can fetch WAL, for observability.
pub peers: Vec<PeerInfo>,
/// for observability
pub num_streaming_computes: usize,
pub donors: Vec<Donor>,
}
// Custom impl to omit unimportant fields from PeerInfo.
impl fmt::Display for RecoveryNeededInfo {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{{")?;
write!(
f,
"term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
self.term, self.last_log_term, self.flush_lsn
)?;
for p in self.peers.iter() {
write!(
f,
"PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
p.sk_id, p.term, p.last_log_term, p.flush_lsn
)?;
}
write!(
f,
"}} num_streaming_computes: {}, donors: {:?}",
self.num_streaming_computes, self.donors
)
}
}
#[derive(Clone, Debug)]
pub struct Donor {
pub sk_id: NodeId,
/// equals to last_log_term
pub term: Term,
pub flush_lsn: Lsn,
pub pg_connstr: String,
pub http_connstr: String,
}
impl From<&PeerInfo> for Donor {
fn from(p: &PeerInfo) -> Self {
Donor {
sk_id: p.sk_id,
term: p.term,
flush_lsn: p.flush_lsn,
pg_connstr: p.pg_connstr.clone(),
http_connstr: p.http_connstr.clone(),
}
}
}
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
async fn recovery_main_loop(_tli: Arc<Timeline>) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
match recovery_needed_info.donors.first() {
Some(donor) => {
info!(
"starting recovery from donor {}: {}",
donor.sk_id, recovery_needed_info
);
match recover(tli.clone(), donor, &conf).await {
// Note: 'write_wal rewrites WAL written before' error is
// expected here and might happen if compute and recovery
// concurrently write the same data. Eventually compute
// should win.
Err(e) => warn!("recovery failed: {:#}", e),
Ok(msg) => info!("recovery finished: {}", msg),
}
}
None => {
trace!(
"recovery not needed or not possible: {}",
recovery_needed_info
);
}
}
sleep(check_duration).await;
}
}
/// Recover from the specified donor. Returns a message explaining the normal finish
/// reason, or an error.
async fn recover(
tli: Arc<Timeline>,
donor: &Donor,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
// Learn donor term switch history to figure out starting point.
let client = reqwest::Client::new();
let timeline_info: TimelineStatus = client
.get(format!(
"http://{}/v1/tenant/{}/timeline/{}",
donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
))
.send()
.await?
.json()
.await?;
if timeline_info.acceptor_state.term != donor.term {
bail!(
"donor term changed from {} to {}",
donor.term,
timeline_info.acceptor_state.term
);
}
// convert from API TermSwitchApiEntry into TermLsn.
let donor_th = TermHistory(
timeline_info
.acceptor_state
.term_history
.iter()
.map(|tl| Into::<TermLsn>::into(*tl))
.collect(),
);
// Now understand our term history.
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
let vote_response = match tli
.process_msg(&vote_request)
.await
.context("VoteRequest handling")?
{
Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
_ => {
bail!("unexpected VoteRequest response"); // unreachable
}
};
if vote_response.term != donor.term {
bail!(
"our term changed from {} to {}",
donor.term,
vote_response.term
);
}
let last_common_point = match TermHistory::find_highest_common_point(
&donor_th,
&vote_response.term_history,
vote_response.flush_lsn,
) {
None => bail!(
"couldn't find common point in histories, donor {:?}, sk {:?}",
donor_th,
vote_response.term_history,
),
Some(lcp) => lcp,
};
info!("found last common point at {:?}", last_common_point);
// truncate WAL locally
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
term: donor.term,
start_streaming_at: last_common_point.lsn,
term_history: donor_th,
timeline_start_lsn: Lsn::INVALID,
});
// Successful ProposerElected handling always returns None. If the term changed,
// we'll find that out during the streaming. Note: it is expected to get
// 'refusing to overwrite correct WAL' here if the walproposer reconnected
// concurrently; a restart helps here.
tli.process_msg(&pe)
.await
.context("ProposerElected handling")?;
recovery_stream(tli, donor, last_common_point.lsn, conf).await
}
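
Before the streaming part below, note the handshake's safety pattern: the donor's term is pinned when it is chosen, and both the donor's reported term and our own post-vote term must still match it, or we bail and retry on the next iteration. A hedged outline of just those checks, with hypothetical stubs in place of the HTTP status fetch and the VoteRequest handling (not the crate's API):

// Hedged outline of the recovery handshake's term checks; both fetch
// functions are hypothetical stubs standing in for real calls.
fn fetch_donor_term() -> u64 { 2 } // stand-in for the donor's HTTP timeline status
fn vote_in_term(term: u64) -> u64 { term } // stand-in for handling our VoteRequest

fn main() -> Result<(), String> {
    let donor_term = 2u64; // term the donor advertised when it was chosen
    // 1. The donor must still be in the term we chose it for.
    let current_donor_term = fetch_donor_term();
    if current_donor_term != donor_term {
        return Err(format!("donor term changed from {donor_term} to {current_donor_term}"));
    }
    // 2. Voting in the donor's term must not bump our own term past it.
    let our_term = vote_in_term(donor_term);
    if our_term != donor_term {
        return Err(format!("our term changed from {donor_term} to {our_term}"));
    }
    // 3. Only now is it safe to truncate local WAL to the highest common
    //    point and start streaming from the donor.
    println!("handshake ok in term {donor_term}, proceed to streaming");
    Ok(())
}
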
// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
tli: Arc<Timeline>,
donor: &Donor,
start_streaming_at: Lsn,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
// TODO: pass auth token
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
let mut cfg = cfg.to_tokio_postgres_config();
// It will make the safekeeper give out not-yet-committed WAL (up to flush_lsn).
cfg.application_name(&format!("safekeeper_{}", conf.my_id));
cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
let connect_timeout = Duration::from_millis(10000);
let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
.await
{
Ok(client_and_conn) => client_and_conn?,
Err(_elapsed) => {
bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
}
};
trace!("connected to {:?}", donor);
// The connection object performs the actual communication with the
// server, spawn it off to run on its own.
let ttid = tli.ttid;
tokio::spawn(async move {
if let Err(e) = connection
.instrument(info_span!("recovery task connection poll", ttid = %ttid))
.await
{
// This logging isn't very useful, as the error is forwarded to the client anyway.
trace!(
"tokio_postgres connection object finished with error: {}",
e
);
}
});
let query = format!(
"START_REPLICATION PHYSICAL {} (term='{}')",
start_streaming_at, donor.term
);
let copy_stream = client.copy_both_simple(&query).await?;
let physical_stream = ReplicationStream::new(copy_stream);
// As in normal walreceiver, do networking and writing to disk in parallel.
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
let res = tokio::select! {
r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
r = read_replies(reply_rx, donor.term) => r.map(|()| None),
};
// Join the spawned WalAcceptor. At this point the channels to/from it that were
// passed to the network routines are dropped, so it will exit as soon as it touches them.
match wa.await {
Ok(Ok(())) => {
// WalAcceptor finished normally; the termination reason comes from the other branch
match res {
Ok(Some(success_desc)) => Ok(success_desc),
Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
Err(e) => Err(e), // network error or term change
}
}
Ok(Err(e)) => Err(e), // error while processing message
Err(e) => bail!("WalAcceptor panicked: {}", e),
}
}
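
The wiring above, networking and disk writes in separate tasks joined by select, is a generic tokio pattern. A minimal, self-contained sketch of it follows; the channel sizes and the u64 "messages" are placeholders standing in for the crate's types, and the tasks are stand-ins rather than the real WalAcceptor API:

// Hedged sketch of the recovery_stream wiring: a stand-in "acceptor" task
// drains messages and acknowledges them; select! finishes when either the
// network side or the reply reader is done, after which the channel halves
// they owned are dropped and the acceptor can be joined safely.
use tokio::sync::mpsc::channel;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let (msg_tx, mut msg_rx) = channel::<u64>(16); // placeholder queue sizes
    let (reply_tx, mut reply_rx) = channel::<u64>(16);

    // Stand-in for WalAcceptor: consume messages, acknowledge each one.
    let acceptor = tokio::spawn(async move {
        while let Some(lsn) = msg_rx.recv().await {
            if reply_tx.send(lsn).await.is_err() {
                break; // reply reader is gone
            }
        }
    });

    // Stand-in for network_io: push a few "AppendRequests".
    let network = async move {
        for lsn in [10u64, 20, 30] {
            if msg_tx.send(lsn).await.is_err() {
                break; // acceptor is gone
            }
        }
    };

    // Stand-in for read_replies: watch acknowledgements.
    let replies = async move {
        while let Some(lsn) = reply_rx.recv().await {
            println!("acknowledged {lsn}");
        }
    };

    // Either branch finishing ends the select; both futures are then dropped,
    // closing msg_tx/reply_rx, which in turn lets the acceptor task exit.
    tokio::select! {
        _ = network => {}
        _ = replies => {}
    }
    acceptor.await.unwrap();
}
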
// Perform the network part of streaming: read data and push it to msg_tx, and
// send keepalives so the sender hears from us. If nothing arrives for a while,
// check for termination.
// Returns
// - Ok(None) if the channel to WalAcceptor was closed -- its task should return the error.
// - Ok(Some(String)) if recovery completed successfully.
// - Err if an error happened while reading from/writing to the socket.
async fn network_io(
physical_stream: ReplicationStream,
msg_tx: Sender<ProposerAcceptorMessage>,
donor: Donor,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
let mut physical_stream = pin!(physical_stream);
let mut last_received_lsn = Lsn::INVALID;
// tear down the connection if no data arrives within this period
let no_data_timeout = Duration::from_millis(30000);
loop {
let msg = match timeout(no_data_timeout, physical_stream.next()).await {
Ok(next) => match next {
None => bail!("unexpected end of replication stream"),
Some(msg) => msg.context("get replication message")?,
},
Err(_) => bail!("no message received within {:?}", no_data_timeout),
};
match msg {
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
term: donor.term,
epoch_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance it; peer communication does that anyway
truncate_lsn: Lsn::INVALID, // do not attempt to advance
proposer_uuid: [0; 16],
};
let ar = AppendRequest {
h: ar_hdr,
wal_data: xlog_data.into_data(),
};
trace!(
"processing AppendRequest {}-{}, len {}",
ar.h.begin_lsn,
ar.h.end_lsn,
ar.wal_data.len()
);
last_received_lsn = ar.h.end_lsn;
if msg_tx
.send(ProposerAcceptorMessage::AppendRequest(ar))
.await
.is_err()
{
return Ok(None); // chan closed, WalAcceptor terminated
}
}
ReplicationMessage::PrimaryKeepAlive(_) => {
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
// do the current donors still contain the one we are connected to?
if !recovery_needed_info
.donors
.iter()
.any(|d| d.sk_id == donor.sk_id)
{
// Most likely it means we have caught up.
// Note: just exiting makes tokio_postgres send CopyFail to the far end.
return Ok(Some(format!(
"terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
)));
}
}
_ => {}
}
// Send a reply to each message to keep the connection alive. Ideally we
// should do that only once in a while instead, but that again requires a
// stream split or a similar workaround, and recovery is not that
// performance critical anyway.
//
// We do not know the real write/flush LSNs here (we'd need to take the mutex
// again or check the replies, which are read in a different future), but the
// sender doesn't much care about them either, so just send the last received LSN.
physical_stream
.as_mut()
.standby_status_update(
PgLsn::from(last_received_lsn.0),
PgLsn::from(last_received_lsn.0),
PgLsn::from(last_received_lsn.0),
SystemTime::now(),
0,
)
.await?;
}
}
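
The silence watchdog at the top of the loop above is worth isolating: wrap every stream read in a timeout and bail if nothing arrives in time. A small, self-contained sketch of just that pattern; tokio_stream's iter stands in for the replication stream, and the timeout value is a placeholder:

// Hedged sketch of the no-data watchdog: each read from the stream must
// complete within no_data_timeout, otherwise we tear the connection down.
use std::time::Duration;
use tokio::time::timeout;
use tokio_stream::StreamExt;

#[tokio::main(flavor = "current_thread")]
async fn main() -> Result<(), String> {
    let mut stream = tokio_stream::iter([10u64, 20, 30]); // stand-in for ReplicationStream
    let no_data_timeout = Duration::from_millis(100); // placeholder value
    loop {
        match timeout(no_data_timeout, stream.next()).await {
            Ok(Some(msg)) => println!("got message ending at {msg}"),
            Ok(None) => return Ok(()), // stream ended (the real code bails here)
            Err(_) => return Err(format!("no message received within {no_data_timeout:?}")),
        }
    }
}
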
// Read replies from WalAcceptor. We are not much interested in sending them to
// the donor safekeeper, so we don't route them anywhere. However, we should check
// whether the term changes and exit if it does.
// Returns Ok(()) if the channel closed, Err in case of a term change.
async fn read_replies(
mut reply_rx: Receiver<AcceptorProposerMessage>,
donor_term: Term,
) -> anyhow::Result<()> {
loop {
match reply_rx.recv().await {
Some(msg) => {
if let AcceptorProposerMessage::AppendResponse(ar) = msg {
if ar.term != donor_term {
bail!("donor term changed from {} to {}", donor_term, ar.term);
}
}
}
None => return Ok(()), // chan closed, WalAcceptor terminated
}
}
}


@@ -91,69 +91,6 @@ impl TermHistory {
}
TermHistory(res)
}
/// Find the point of divergence between the leader (walproposer) term history and
/// the safekeeper's. The arguments are not symmetric, as the proposer history ends at
/// +infinity while the safekeeper's ends at flush_lsn.
/// The C version of this is in walproposer's SendProposerElected.
pub fn find_highest_common_point(
prop_th: &TermHistory,
sk_th: &TermHistory,
sk_wal_end: Lsn,
) -> Option<TermLsn> {
let (prop_th, sk_th) = (&prop_th.0, &sk_th.0); // avoid .0 below
if let Some(sk_th_last) = sk_th.last() {
assert!(
sk_th_last.lsn <= sk_wal_end,
"safekeeper term history end {:?} LSN is higher than WAL end {:?}",
sk_th_last,
sk_wal_end
);
}
// find last common term, if any...
let mut last_common_idx = None;
for i in 0..min(sk_th.len(), prop_th.len()) {
if prop_th[i].term != sk_th[i].term {
break;
}
// If term is the same, LSN must be equal as well.
assert!(
prop_th[i].lsn == sk_th[i].lsn,
"same term {} has different start LSNs: prop {}, sk {}",
prop_th[i].term,
prop_th[i].lsn,
sk_th[i].lsn
);
last_common_idx = Some(i);
}
let last_common_idx = match last_common_idx {
None => return None, // no common point
Some(lci) => lci,
};
// Now find where it ends on both the prop and sk sides and take the min. The
// end of the (common) term is the start of the next one, except for the last
// entry: there it is flush_lsn in the safekeeper's case, or +infinity in the
// proposer's case, so we just take flush_lsn then.
if last_common_idx == prop_th.len() - 1 {
Some(TermLsn {
term: prop_th[last_common_idx].term,
lsn: sk_wal_end,
})
} else {
let prop_common_term_end = prop_th[last_common_idx + 1].lsn;
let sk_common_term_end = if last_common_idx + 1 < sk_th.len() {
sk_th[last_common_idx + 1].lsn
} else {
sk_wal_end
};
Some(TermLsn {
term: prop_th[last_common_idx].term,
lsn: min(prop_common_term_end, sk_common_term_end),
})
}
}
}
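
The removed search is compact but subtle, so here is a hedged, self-contained sketch of the same logic, with plain (term, lsn) tuples standing in for TermLsn and slices for TermHistory, walking the "middle" case from the tests removed further down: the histories share terms 1 and 2, the proposer ends term 2 at LSN 40, the safekeeper at LSN 30, so the highest common point is (2, 30).

// Hedged sketch of find_highest_common_point: (term, lsn) pairs stand in for
// the crate's TermLsn, and slices for TermHistory.
fn highest_common_point(
    prop: &[(u64, u64)],
    sk: &[(u64, u64)],
    sk_wal_end: u64,
) -> Option<(u64, u64)> {
    // Find the last index at which both histories agree on the term.
    let mut last_common = None;
    for i in 0..prop.len().min(sk.len()) {
        if prop[i].0 != sk[i].0 {
            break;
        }
        // If the term is the same, it must start at the same LSN as well.
        assert_eq!(prop[i].1, sk[i].1, "same term has different start LSNs");
        last_common = Some(i);
    }
    let i = last_common?; // no common point at all
    // The common term ends where the next one starts; for the proposer's last
    // entry the history extends to +infinity, so we just take sk_wal_end.
    let lsn = if i == prop.len() - 1 {
        sk_wal_end
    } else {
        let prop_end = prop[i + 1].1;
        let sk_end = sk.get(i + 1).map(|t| t.1).unwrap_or(sk_wal_end);
        prop_end.min(sk_end)
    };
    Some((prop[i].0, lsn))
}

fn main() {
    let prop = [(1, 10), (2, 20), (4, 40)];
    let sk = [(1, 10), (2, 20), (3, 30)];
    // Last common term is 2; the proposer ends it at 40, the safekeeper at 30.
    assert_eq!(highest_common_point(&prop, &sk, 40), Some((2, 30)));
}
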
/// Display only latest entries for Debug.
@@ -368,19 +305,19 @@ pub struct AcceptorGreeting {
/// Vote request sent from proposer to safekeepers
#[derive(Debug, Deserialize)]
pub struct VoteRequest {
pub term: Term,
term: Term,
}
/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
vote_given: u64, // fixme u64 due to padding
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
// proposer to choose the most advanced one.
pub flush_lsn: Lsn,
flush_lsn: Lsn,
truncate_lsn: Lsn,
pub term_history: TermHistory,
term_history: TermHistory,
timeline_start_lsn: Lsn,
}
@@ -407,8 +344,7 @@ pub struct AppendRequest {
pub struct AppendRequestHeader {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// TODO: remove this field, it is unused -- the LSN of the term switch can be taken
// from ProposerElected (as well as from the term history).
// LSN since which the proposer appends WAL; determines the epoch switch point.
pub epoch_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
@@ -823,7 +759,7 @@ where
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
// Otherwise we must never attempt to truncate committed data.
// Otherwise this shouldn't happen.
assert!(
msg.start_streaming_at >= self.inmem.commit_lsn,
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
@@ -874,14 +810,6 @@ where
info!("start receiving WAL since {:?}", msg.start_streaming_at);
// Cache the LSN where the term starts, to immediately fsync the control file with
// commit_lsn once we reach it -- sync-safekeepers finishes when the
// persisted commit_lsn on a majority of safekeepers aligns.
self.epoch_start_lsn = match msg.term_history.0.last() {
None => bail!("proposer elected with empty term history"),
Some(term_lsn_start) => term_lsn_start.lsn,
};
Ok(None)
}
@@ -907,7 +835,10 @@ where
// file: walproposer in sync mode is very interested in when this
// happens. Note: this is for sync-safekeepers mode only, as
// otherwise commit_lsn might jump over epoch_start_lsn.
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
// Also note that commit_lsn can reach epoch_start_lsn earlier
// than we receive the new epoch_start_lsn, and we still need to sync
// the control file in this case.
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
self.persist_control_file(self.state.clone()).await?;
}
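
The new guard persists the control file exactly once, when commit_lsn first lands on epoch_start_lsn. A standalone sketch of just that behavior; plain u64s stand in for Lsn, and println for the actual fsync:

// Hedged sketch of the control-file sync guard added above: persist once,
// the first time commit_lsn reaches epoch_start_lsn.
struct State {
    epoch_start_lsn: u64,
    persisted_commit_lsn: u64,
}

impl State {
    fn update_commit_lsn(&mut self, commit_lsn: u64) {
        if commit_lsn == self.epoch_start_lsn && self.persisted_commit_lsn != commit_lsn {
            println!("fsync control file at {commit_lsn}"); // stand-in for persist_control_file
            self.persisted_commit_lsn = commit_lsn;
        }
    }
}

fn main() {
    let mut s = State { epoch_start_lsn: 100, persisted_commit_lsn: 0 };
    s.update_commit_lsn(50); // below the epoch start: no sync
    s.update_commit_lsn(100); // first arrival at the epoch start: sync
    s.update_commit_lsn(100); // repeated: already persisted, no sync
}
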
@@ -971,6 +902,7 @@ where
// Now we know that we are in the same term as the proposer,
// processing the message.
self.epoch_start_lsn = msg.h.epoch_start_lsn;
self.inmem.proposer_uuid = msg.h.proposer_uuid;
// do the job
@@ -1253,65 +1185,4 @@ mod tests {
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_epoch(), 1);
}
#[test]
fn test_find_highest_common_point_none() {
let prop_th = TermHistory(vec![(0, Lsn(1)).into()]);
let sk_th = TermHistory(vec![(1, Lsn(1)).into(), (2, Lsn(2)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(3),),
None
);
}
#[test]
fn test_find_highest_common_point_middle() {
let prop_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(),
(4, Lsn(40)).into(),
]);
let sk_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(),
(3, Lsn(30)).into(), // sk ends last common term 2 at 30
]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(40),),
Some(TermLsn {
term: 2,
lsn: Lsn(30),
})
);
}
#[test]
fn test_find_highest_common_point_sk_end() {
let prop_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(), // last common term 2, sk will end it at 32 sk_end_lsn
(4, Lsn(40)).into(),
]);
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
Some(TermLsn {
term: 2,
lsn: Lsn(32),
})
);
}
#[test]
fn test_find_highest_common_point_walprop() {
let prop_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
Some(TermLsn {
term: 2,
lsn: Lsn(32),
})
);
}
}


@@ -418,11 +418,10 @@ impl SafekeeperPostgresHandler {
}
info!(
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
start_pos,
end_pos,
matches!(end_watch, EndWatch::Flush(_)),
appname
matches!(end_watch, EndWatch::Flush(_))
);
// switch to copy


@@ -11,7 +11,6 @@ use tokio::fs;
use serde_with::DisplayFromStr;
use std::cmp::max;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{Mutex, MutexGuard};
use tokio::{
sync::{mpsc::Sender, watch},
@@ -28,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
use crate::recovery::recovery_main;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
@@ -46,12 +45,11 @@ use crate::{debug_dump, wal_storage};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerInfo {
pub sk_id: NodeId,
pub term: Term,
/// Term of the last entry.
pub last_log_term: Term,
_last_log_term: Term,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
pub flush_lsn: Lsn,
_flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
@@ -63,21 +61,16 @@ pub struct PeerInfo {
#[serde(skip)]
#[serde(default = "Instant::now")]
ts: Instant,
pub pg_connstr: String,
pub http_connstr: String,
}
impl PeerInfo {
fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
PeerInfo {
sk_id: NodeId(sk_info.safekeeper_id),
term: sk_info.term,
last_log_term: sk_info.last_log_term,
flush_lsn: Lsn(sk_info.flush_lsn),
_last_log_term: sk_info.last_log_term,
_flush_lsn: Lsn(sk_info.flush_lsn),
commit_lsn: Lsn(sk_info.commit_lsn),
local_start_lsn: Lsn(sk_info.local_start_lsn),
pg_connstr: sk_info.safekeeper_connstr.clone(),
http_connstr: sk_info.http_connstr.clone(),
ts,
}
}
@@ -119,6 +112,7 @@ pub struct SharedState {
/// TODO: it might be better to remove tli completely from GlobalTimelines
/// when tli is inactive instead of having this flag.
active: bool,
num_computes: u32,
last_removed_segno: XLogSegNo,
}
@@ -157,6 +151,7 @@ impl SharedState {
peers_info: PeersInfo(vec![]),
wal_backup_active: false,
active: false,
num_computes: 0,
last_removed_segno: 0,
})
}
@@ -176,6 +171,7 @@ impl SharedState {
peers_info: PeersInfo(vec![]),
wal_backup_active: false,
active: false,
num_computes: 0,
last_removed_segno: 0,
})
}
@@ -223,7 +219,7 @@ impl SharedState {
};
trace!(
"timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
self.sk.state.timeline_id, action_pending, num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn
self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn
);
}
res
@@ -269,20 +265,6 @@ impl SharedState {
availability_zone: conf.availability_zone.clone(),
}
}
/// Get our latest view of the alive peers' status on the timeline.
/// We pass our own info through the broker as well, so when we don't have a
/// connection to the broker the returned vec is empty.
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
let now = Instant::now();
self.peers_info
.0
.iter()
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
.filter(|p| now.duration_since(p.ts) <= heartbeat_timeout)
.cloned()
.collect()
}
}
#[derive(Debug, thiserror::Error)]
@@ -464,9 +446,7 @@ impl Timeline {
/// Bootstrap a new or existing timeline, starting its background tasks.
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
// Start recovery task which always runs on the timeline.
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
/// Delete timeline from disk completely, by removing timeline directory. Background
@@ -551,7 +531,7 @@ impl Timeline {
return true;
}
let shared_state = self.write_shared_state().await;
if self.walreceivers.get_num() == 0 {
if shared_state.num_computes == 0 {
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
}
@@ -700,88 +680,20 @@ impl Timeline {
Ok(())
}
/// Get our latest view of the alive peers' status on the timeline.
/// We pass our own info through the broker as well, so when we don't have a
/// connection to the broker the returned vec is empty.
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state().await;
shared_state.get_peers(conf.heartbeat_timeout)
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? The answer is yes, i.e. .donors is not empty, if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are chosen, their term is guaranteed to be equal to their
/// last_log_term, so we are sure such a leader was ever elected.
///
/// All possible donors are returned so that we can keep the connection to the
/// current one if it is good, even if it slightly lags behind.
///
/// Note that the term conditions above might not be met while safekeepers are
/// still not aligned on the last flush_lsn. Generally, in this case, until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- the history which would be committed differs
/// depending on the assembled quorum (e.g. the classic figure 8 from the Raft
/// paper). Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.write_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = self.walreceivers.get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover, so as not to interfere.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) its term
// is equal to its last_log_term, so we can act on
// behalf of the leader of this term (we must be sure it was
// ever elected), and 2) our term is not higher, or we'll refuse the data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
let now = Instant::now();
shared_state
.peers_info
.0
.iter()
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
.filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
.cloned()
.collect()
}
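
For intuition about the donor filter removed in this hunk, a hedged, self-contained sketch; plain (term, last_log_term, flush_lsn) tuples stand in for PeerInfo, and the real code also skips recovery entirely while a compute is streaming. A peer qualifies as a donor only if its (last_log_term, flush_lsn) is strictly ahead of ours, its term equals its last_log_term (so a leader of that term was actually elected), and its term is not below ours.

// Hedged sketch of donor selection: tuple ordering on (last_log_term, flush_lsn)
// mirrors the TermLsn comparison used in the removed code.
fn pick_donors(
    my_term: u64,
    my_last_log_term: u64,
    my_flush_lsn: u64,
    peers: &[(u64, u64, u64)], // (term, last_log_term, flush_lsn)
) -> Vec<(u64, u64, u64)> {
    peers
        .iter()
        .filter(|&&(term, last_log_term, flush_lsn)| {
            let ahead = (last_log_term, flush_lsn) > (my_last_log_term, my_flush_lsn);
            // Pull without elections only if the peer's term was ever elected
            // (term == last_log_term) and our own term won't make us refuse the data.
            ahead && term == last_log_term && term >= my_term
        })
        .copied()
        .collect()
}

fn main() {
    // We are at term 2, last_log_term 2, flush_lsn 100.
    let peers = [
        (2, 2, 150), // ahead within the same elected term: donor
        (3, 2, 200), // ahead, but term 3 was never elected (term != last_log_term): skip
        (2, 2, 90),  // behind us: skip
    ];
    assert_eq!(pick_donors(2, 2, 100, &peers), vec![(2, 2, 150)]);
}
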
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
@@ -853,7 +765,7 @@ impl Timeline {
ps_feedback,
wal_backup_active: state.wal_backup_active,
timeline_is_active: state.active,
num_computes: self.walreceivers.get_num() as u32,
num_computes: state.num_computes,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.inmem.clone(),
@@ -880,7 +792,7 @@ impl Timeline {
walsenders: self.walsenders.get_all(),
wal_backup_active: state.wal_backup_active,
active: state.active,
num_computes: self.walreceivers.get_num() as u32,
num_computes: state.num_computes,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.inmem.clone(),


@@ -357,12 +357,6 @@ class PgProtocol:
result.append(cur.fetchall())
return result
def safe_psql_scalar(self, query) -> Any:
"""
Execute query returning single row with single column.
"""
return self.safe_psql(query)[0][0]
@dataclass
class AuthKeys:
@@ -1638,9 +1632,6 @@ class NeonPageserver(PgProtocol):
# these can happen during shutdown, but it should not be a reason to fail a test
".*completed, took longer than expected.*",
'.*registered custom resource manager "neon".*',
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
# and it is not a failure of our code when it happens.
".*DeleteObjects.*We encountered an internal error. Please try again.*",
]
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
@@ -2583,13 +2574,6 @@ class Endpoint(PgProtocol):
):
self.stop()
# Checkpoints the running endpoint and returns pg_wal size in MB.
def get_pg_wal_size(self):
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
self.safe_psql("checkpoint")
assert self.pgdata_dir is not None # please mypy
return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
class EndpointFactory:
"""An object representing multiple compute endpoints."""
@@ -2771,27 +2755,6 @@ class Safekeeper:
def data_dir(self) -> str:
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
def timeline_dir(self, tenant_id, timeline_id) -> str:
return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
def list_segments(self, tenant_id, timeline_id) -> List[str]:
"""
Get list of segment names of the given timeline.
"""
tli_dir = self.timeline_dir(tenant_id, timeline_id)
segments = []
for _, _, filenames in os.walk(tli_dir):
segments.extend([f for f in filenames if f != "safekeeper.control"])
segments.sort()
return segments
# Walreceiver as returned by sk's timeline status endpoint.
@dataclass
class Walreceiver:
conn_id: int
state: str
@dataclass
class SafekeeperTimelineStatus:
@@ -2803,7 +2766,6 @@ class SafekeeperTimelineStatus:
backup_lsn: Lsn
peer_horizon_lsn: Lsn
remote_consistent_lsn: Lsn
walreceivers: List[Walreceiver]
@dataclass
@@ -2865,7 +2827,6 @@ class SafekeeperHttpClient(requests.Session):
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
res.raise_for_status()
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
return SafekeeperTimelineStatus(
acceptor_epoch=resj["acceptor_state"]["epoch"],
pg_version=resj["pg_info"]["pg_version"],
@@ -2875,7 +2836,6 @@ class SafekeeperHttpClient(requests.Session):
backup_lsn=Lsn(resj["backup_lsn"]),
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
walreceivers=walreceivers,
)
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
@@ -3159,22 +3119,6 @@ def check_restored_datadir_content(
assert (mismatch, error) == ([], [])
def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:
"""Wait logical replication subscriber to sync with publisher."""
publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
while True:
res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][
0
]
if res:
log.info(f"subscriber_lsn={res}")
subscriber_lsn = Lsn(res)
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
if subscriber_lsn >= publisher_lsn:
return subscriber_lsn
time.sleep(0.5)
def wait_for_last_flush_lsn(
env: NeonEnv,
endpoint: Endpoint,


@@ -453,15 +453,6 @@ class PageserverHttpClient(requests.Session):
res_json = res.json()
return res_json
def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}",
)
self.verbose_error(res)
res_json = res.json()
return res_json
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()


@@ -1,43 +0,0 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin, logical_replication_sync
@pytest.mark.timeout(1000)
def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
env = neon_simple_env
env.neon_cli.create_branch("test_logical_replication", "empty")
endpoint = env.endpoints.create_start("test_logical_replication")
log.info("postgres is running on 'test_logical_replication' branch")
pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
# now start subscriber
vanilla_pg.start()
pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()])
vanilla_pg.safe_psql("truncate table pgbench_accounts")
vanilla_pg.safe_psql("truncate table pgbench_history")
connstr = endpoint.connstr().replace("'", "''")
print(f"connstr='{connstr}'")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
# Wait for the logical replication channel to be established
logical_replication_sync(vanilla_pg, endpoint)
pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()])
# Wait for logical replication to sync
start = time.time()
logical_replication_sync(vanilla_pg, endpoint)
log.info(f"Sync with master took {time.time() - start} seconds")
sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
assert sum_master == sum_replica

File diff suppressed because it is too large


@@ -9,8 +9,8 @@ publish = false
[dependencies]
native-tls = "0.2.11"
postgres-native-tls = "0.5.0"
tokio = { version = "1.33", features=["rt", "macros"] }
tokio-postgres = "0.7.10"
tokio = { version = "1.28", features=["rt", "macros"] }
tokio-postgres = "0.7.8"
# This is not part of the main 'neon' workspace


@@ -1,4 +1,4 @@
FROM rust:1.73
FROM rust:1.70
WORKDIR /source
COPY . .


@@ -311,8 +311,8 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
assert isinstance(failed, Exception)
assert isinstance(succeeded, Dict)
# there are multiple valid status codes:
# - Timeline x/y already exists
# FIXME: there are probably multiple valid status codes:
# - Timeline 62505b9a9f6b1d29117b1b74eaf07b12/56cd19d3b2dbcc65e9d53ec6ca304f24 already exists
# - whatever 409 response says, but that is a subclass of PageserverApiException
assert isinstance(failed, PageserverApiException)
assert succeeded["state"] == "Active"
@@ -320,14 +320,17 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
# we might still have the failpoint active
env.pageserver.stop(immediate=True)
# pytest should nag if we leave threads unjoined
for t in threads:
t.join()
create_root.join()
def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
def test_non_uploaded_branch_availability_after_restart(neon_env_builder: NeonEnvBuilder):
"""
Check that a timeline is deleted locally on a subsequent restart if it was never successfully uploaded during creation.
Currently, before RFC#27, we keep and continue uploading branches which were not successfully uploaded before shutdown.
This test likely duplicates some other test, but it's easier to write one than to make sure there will be a failing test when the RFC is implemented.
"""
env = neon_env_builder.init_configs()
@@ -363,59 +366,9 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
wait_until_tenant_active(ps_http, env.initial_tenant)
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
"""
Check that a timeline is deleted locally on a subsequent restart if it was never successfully uploaded during creation.
"""
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.append(
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
)
ps_http = env.pageserver.http_client()
ps_http.tenant_create(env.initial_tenant)
ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
branch_id = TimelineId.generate()
def start_creating_timeline():
with pytest.raises(RequestException):
ps_http.timeline_create(
env.pg_version,
env.initial_tenant,
branch_id,
ancestor_timeline_id=env.initial_timeline,
timeout=60,
)
t = threading.Thread(target=start_creating_timeline)
try:
t.start()
wait_until_paused(env, "before-upload-index-pausable")
finally:
# FIXME: paused uploads bother shutdown
env.pageserver.stop(immediate=True)
t.join()
# now without a failpoint
env.pageserver.start()
wait_until_tenant_active(ps_http, env.initial_tenant)
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
with pytest.raises(PageserverApiException, match="not found"):
ps_http.timeline_detail(env.initial_tenant, branch_id)
# currently it lives on and will eventually get uploaded, but this will change
detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
assert detail["state"] == "Active"
def wait_until_paused(env: NeonEnv, failpoint: str):


@@ -1,9 +1,9 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload_queue_empty,
wait_until_tenant_active,
)
@@ -41,12 +41,9 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"""
Test sets a fail point at the end of the first compaction phase: after
flushing the new L1 layer but before deletion of the L0 layers.
The L1 used to be overwritten, but with crash consistency via the remote
index_part.json, we end up deleting the not-yet-uploaded L1 layer on
startup.
This test sets a fail point at the end of the first compaction phase:
after flushing new L1 layers but before deletion of L0 layers.
It should cause generation of a duplicate L1 layer by compaction after restart.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
@@ -68,8 +65,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
# make sure we receive no new WAL after this, so that we'll write over the same L1 file.
endpoint.stop()
@@ -78,7 +74,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
# hit the exit failpoint
with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_compact(tenant_id, timeline_id)
env.pageserver.stop()
# now the duplicate L1 has been created, but is not yet uploaded
@@ -111,32 +107,33 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
l1_found = path
assert l1_found is not None, "failed to find L1 locally"
original_created_at = l1_found.stat()[8]
uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
# give room for fs timestamps
time.sleep(1)
env.pageserver.start()
wait_until_tenant_active(pageserver_http, tenant_id)
assert not l1_found.exists(), "partial compaction result should have been removed during startup"
# wait for us to catch up again
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
message = f".*duplicated L1 layer layer={l1_found.name}"
env.pageserver.allowed_errors.append(message)
pageserver_http.timeline_compact(tenant_id, timeline_id)
# give time for log flush
time.sleep(1)
message = f".*duplicated L1 layer layer={l1_found.name}"
found_msg = env.pageserver.log_contains(message)
# resident or evicted, it should not be overwritten; however, it should have been non-existent at startup
assert (
found_msg is None
), "layer should have been removed during startup, did it live on as evicted?"
assert found_msg is not None, "no layer was duplicated, has this been fixed already?"
assert l1_found.exists(), "the L1 reappears"
log.info(f"found log line: {found_msg}")
overwritten_at = l1_found.stat()[8]
assert original_created_at < overwritten_at, "expected the L1 to be overwritten"
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
assert uploaded.exists(), "the L1 is uploaded"
uploaded_at = uploaded.stat()[8]
assert overwritten_at <= uploaded_at, "expected the L1 to finally be uploaded"


@@ -2,6 +2,7 @@ import asyncio
import concurrent.futures
import random
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
@@ -94,12 +95,13 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
#
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant


@@ -1,5 +1,6 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -13,10 +14,12 @@ from fixtures.utils import query_scalar
# Creates a few layers, ensures that we can evict them (removing them locally but keeping track of them anyway)
# and then download them back.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_basic_eviction(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start(
initial_tenant_conf={


@@ -1,149 +0,0 @@
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
logical_replication_sync,
wait_for_last_flush_lsn,
)
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty")
endpoint = env.endpoints.create_start(
"test_logical_replication", config_lines=["log_statement=all"]
)
log.info("postgres is running on 'test_logical_replication' branch")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("create table t(pk integer primary key, payload integer)")
cur.execute(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
)
cur.execute("create publication pub1 for table t, replication_example")
# now start subscriber
vanilla_pg.start()
vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
vanilla_pg.safe_psql(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
)
connstr = endpoint.connstr().replace("'", "''")
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
# Wait for the logical replication channel to be established
logical_replication_sync(vanilla_pg, endpoint)
# insert some data
cur.execute("insert into t values (generate_series(1,1000), 0)")
# Wait for logical replication to sync
logical_replication_sync(vanilla_pg, endpoint)
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000
# now stop subscriber...
vanilla_pg.stop()
# ... and insert some more data which should be delivered to subscriber after restart
cur.execute("insert into t values (generate_series(1001,2000), 0)")
# Restart compute
endpoint.stop()
endpoint.start()
# start subscriber
vanilla_pg.start()
# Wait for logical replication to sync
logical_replication_sync(vanilla_pg, endpoint)
# Check that the subscriber receives all data
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000
# Test that save/restore of RewriteMappingFile works. Partial copy of
# rewrite.sql test.
log.info("checking rewriteheap")
vanilla_pg.stop()
cmds = """
INSERT INTO replication_example(somedata) VALUES (1);
BEGIN;
INSERT INTO replication_example(somedata) VALUES (2);
ALTER TABLE replication_example ADD COLUMN testcolumn1 int;
INSERT INTO replication_example(somedata, testcolumn1) VALUES (3, 1);
COMMIT;
BEGIN;
INSERT INTO replication_example(somedata) VALUES (3);
ALTER TABLE replication_example ADD COLUMN testcolumn2 int;
INSERT INTO replication_example(somedata, testcolumn1, testcolumn2) VALUES (4, 2, 1);
COMMIT;
VACUUM FULL pg_am;
VACUUM FULL pg_amop;
VACUUM FULL pg_proc;
VACUUM FULL pg_opclass;
VACUUM FULL pg_type;
VACUUM FULL pg_index;
VACUUM FULL pg_database;
-- repeated rewrites that fail
BEGIN;
CLUSTER pg_class USING pg_class_oid_index;
CLUSTER pg_class USING pg_class_oid_index;
ROLLBACK;
-- repeated rewrites that succeed
BEGIN;
CLUSTER pg_class USING pg_class_oid_index;
CLUSTER pg_class USING pg_class_oid_index;
CLUSTER pg_class USING pg_class_oid_index;
COMMIT;
-- repeated rewrites in different transactions
VACUUM FULL pg_class;
VACUUM FULL pg_class;
-- reindexing of important relations / indexes
REINDEX TABLE pg_class;
REINDEX INDEX pg_class_oid_index;
REINDEX INDEX pg_class_tblspc_relfilenode_index;
INSERT INTO replication_example(somedata, testcolumn1) VALUES (5, 3);
BEGIN;
INSERT INTO replication_example(somedata, testcolumn1) VALUES (6, 4);
ALTER TABLE replication_example ADD COLUMN testcolumn3 int;
INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (7, 5, 1);
COMMIT;
"""
endpoint.safe_psql_many([q for q in cmds.splitlines() if q != "" and not q.startswith("-")])
# refetch rewrite files from pageserver
endpoint.stop()
endpoint.start()
vanilla_pg.start()
logical_replication_sync(vanilla_pg, endpoint)
eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3"
assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q)
log.info("rewriteheap synced")
# test that removal of repl slots works across restart
vanilla_pg.stop()
time.sleep(1) # wait for conn termination; active slots can't be dropped
endpoint.safe_psql("select pg_drop_replication_slot('sub1');")
endpoint.safe_psql("insert into t values (2001, 1);") # forces WAL flush
# wait for drop message to reach safekeepers (it is not transactional)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()
endpoint.start()
# it must be gone (but walproposer slot still exists, hence 1)
assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1


@@ -1,10 +1,7 @@
import time
from datetime import datetime, timedelta, timezone
from datetime import timedelta
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import PageserverApiException
from fixtures.types import Lsn
from fixtures.utils import query_scalar
@@ -28,14 +25,13 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
cur.execute("CREATE TABLE foo (x integer)")
tbl = []
for i in range(1000):
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
cur.execute(f"INSERT INTO foo VALUES({i})")
# Get the timestamp at UTC
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
tbl.append([i, after_timestamp])
# Execute one more transaction with synchronous_commit enabled, to flush
# all the previous transactions
cur.execute("SET synchronous_commit=on")
cur.execute("INSERT INTO foo VALUES (-1)")
# Wait until WAL is received by pageserver
@@ -71,100 +67,3 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
endpoint_here.stop_and_destroy()
# Test pageserver get_timestamp_of_lsn API
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api")
endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api")
log.info("postgres is running on 'test_ts_of_lsn_api' branch")
cur = endpoint_main.connect().cursor()
# Create table, and insert rows, each in a separate transaction
# Disable synchronous_commit to make this initialization go faster.
#
# Each row contains the current insert LSN and the current timestamp of when
# the row was inserted.
cur.execute("SET synchronous_commit=off")
cur.execute("CREATE TABLE foo (x integer)")
tbl = []
for i in range(1000):
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
# Get the timestamp at UTC
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc)
after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()")
tbl.append([i, after_timestamp, after_lsn])
time.sleep(0.005)
# Execute one more transaction with synchronous_commit enabled, to flush
# all the previous transactions
cur.execute("SET synchronous_commit=on")
cur.execute("INSERT INTO foo VALUES (-1)")
# Wait until WAL is received by pageserver
last_flush_lsn = wait_for_last_flush_lsn(
env, endpoint_main, env.initial_tenant, new_timeline_id
)
with env.pageserver.http_client() as client:
# Check edge cases: lsn larger than the last flush lsn
probe_lsn = Lsn(int(last_flush_lsn) * 20 + 80_000)
result = client.timeline_get_timestamp_of_lsn(
env.initial_tenant,
new_timeline_id,
probe_lsn,
)
# lsn of zero
try:
probe_lsn = Lsn(0)
result = client.timeline_get_timestamp_of_lsn(
env.initial_tenant,
new_timeline_id,
probe_lsn,
)
# There should always be an error here.
raise RuntimeError("there should have been an 'Invalid LSN' error")
except PageserverApiException as error:
assert error.status_code == 500
assert str(error) == "Invalid LSN"
env.pageserver.allowed_errors.append(".*Invalid LSN.*")
# small lsn before initdb_lsn
try:
probe_lsn = Lsn(64)
result = client.timeline_get_timestamp_of_lsn(
env.initial_tenant,
new_timeline_id,
probe_lsn,
)
# There should always be an error here.
raise RuntimeError("there should have been an 'could not find data for key' error")
except PageserverApiException as error:
assert error.status_code == 500
assert str(error).startswith("could not find data for key")
env.pageserver.allowed_errors.append(".*could not find data for key.*")
# Probe a bunch of timestamps in the valid range
step_size = 100
for i in range(step_size, len(tbl), step_size):
after_timestamp = tbl[i][1]
after_lsn = tbl[i][2]
result = client.timeline_get_timestamp_of_lsn(
env.initial_tenant,
new_timeline_id,
after_lsn,
)
log.info("result: %s, after_ts: %s", result, after_timestamp)
# TODO use fromisoformat once we have Python 3.11+
# which has https://github.com/python/cpython/pull/92177
timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace(
tzinfo=timezone.utc
)
assert timestamp < after_timestamp, "after_timestamp after timestamp"
if i > 1:
before_timestamp = tbl[i - step_size][1]
assert timestamp >= before_timestamp, "before_timestamp before timestamp"


@@ -306,10 +306,12 @@ def test_ondemand_download_timetravel(
#
# Ensure that the `download_remote_layers` API works
#
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_download_remote_layers_api(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
##### First start, insert data and upload it to the remote storage
env = neon_env_builder.init_start(
@@ -463,11 +465,14 @@ def test_download_remote_layers_api(
assert query_scalar(cur, "select count(*) from testtab") == table_len
def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
def test_compaction_downloads_on_demand_without_image_creation(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
Create a few layers, then evict, then make sure compaction runs successfully.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
conf = {
# Disable background GC & compaction
@@ -542,14 +547,17 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
assert post_compact[1] >= 3, "should have downloaded the three layers"
def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
def test_compaction_downloads_on_demand_with_image_creation(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
Create layers, compact with a high image_creation_threshold, then run a final compaction with all layers evicted.
Due to the current implementation, this will make image creation download layers on demand, but we cannot
really test for it directly.
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
conf = {
# Disable background GC & compaction
@@ -637,14 +645,17 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
def test_ondemand_download_failure_to_replace(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ondemand_download_failure_to_replace(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
See: https://github.com/neondatabase/neon/issues/3533
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# disable gc and compaction via default tenant config because config is lost while detaching
# so that compaction will not be the one to download the layer but the http handler is

Some files were not shown because too many files have changed in this diff