wip

attachment_service: graceful SIGQUIT (#5626)
2026-05-21 23:20:40 +00:00 · 2023-10-27 11:42:18 -04:00 · 2023-10-24 13:11:40 -04:00 · 2023-10-23 17:30:25 +01:00 · 2023-10-23 17:51:38 +02:00 · 2023-10-23 15:32:34 +01:00
74 changed files with 4358 additions and 1696 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -320,6 +320,9 @@ jobs:
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -32,7 +32,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -90,18 +90,21 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(nproc)
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(nproc)
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(nproc)
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(nproc)
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)

      - name: Run cargo build
        run: cargo build --all --release
@@ -126,7 +129,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -135,6 +138,9 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(nproc)
+
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -285,7 +285,7 @@ dependencies = [
 "log",
 "parking",
 "polling",
- "rustix 0.37.19",
+ "rustix 0.37.25",
 "slab",
 "socket2 0.4.9",
 "waker-fn",
@@ -2582,7 +2582,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
 "hermit-abi",
 "io-lifetimes",
- "rustix 0.37.19",
+ "rustix 0.37.25",
 "windows-sys 0.48.0",
 ]

@@ -3561,7 +3561,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3574,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3585,7 +3585,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3603,7 +3603,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4331,9 +4331,9 @@ dependencies = [

 [[package]]
 name = "rustix"
-version = "0.37.19"
+version = "0.37.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
+checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
 dependencies = [
 "bitflags",
 "errno",
@@ -5174,7 +5174,7 @@ dependencies = [
 "cfg-if",
 "fastrand 1.9.0",
 "redox_syscall 0.3.5",
- "rustix 0.37.19",
+ "rustix 0.37.25",
 "windows-sys 0.45.0",
 ]

@@ -5407,7 +5407,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d#a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6092,6 +6092,17 @@ dependencies = [
 "winapi-util",
 ]

+[[package]]
+name = "walproposer"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "postgres_ffi",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "want"
 version = "0.3.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
+    "libs/walproposer",
 ]

 [workspace.package]
@@ -160,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -185,6 +186,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -200,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="a2d0652ec3f8f710ff8cfc2e7c68f096fb852d9d" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/38
+++ b/38
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -168,6 +168,42 @@ neon-pg-ext-clean-%:
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

+# Build walproposer as a static library. The walproposer source code is
+# located in the pgxn/neon directory.
+#
+# We also copy libpgport.a and libpgcommon.a next to it, because walproposer
+# uses some functions from those libraries.
+#
+# On Linux, some object files are removed from the copied libpgport.a and
+# libpgcommon.a because they depend on openssl and other libraries that are
+# not included in our Rust build.
+.PHONY: walproposer-lib
+walproposer-lib: neon-pg-ext-v16
+	+@echo "Compiling walproposer-lib"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+		pg_strong_random.o
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o \
+		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
+		md5_common.o \
+		checksum_helper.o
+endif
+
+.PHONY: walproposer-lib-clean
+walproposer-lib-clean:
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -302,7 +302,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -36,7 +36,7 @@ use utils::pid_file::{self, PidFileRead};
 // it's waiting. If the process hasn't started/stopped after 5 seconds,
 // it prints a notice that it's taking long, but keeps waiting.
 //
-const RETRY_UNTIL_SECS: u64 = 10;
+const RETRY_UNTIL_SECS: u64 = 10000;
 const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
 const RETRY_INTERVAL_MILLIS: u64 = 100;
 const DOT_EVERY_RETRIES: u64 = 10;
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -268,7 +269,16 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
+
+    tokio::task::spawn(server);
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;

    Ok(())
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,6 +110,7 @@ impl TenantState {
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,8 +23,8 @@ use tracing::debug;

 use crate::s3_bucket::RequestKind;
 use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
-    RemoteStorage, StorageMetadata,
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
+    StorageMetadata,
 };

 pub struct AzureBlobStorage {
@@ -121,22 +121,7 @@ impl AzureBlobStorage {
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
-            let part = match part {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
+            let part = part.map_err(to_download_error)?;
            let data = part
                .data
                .collect()
@@ -157,30 +142,16 @@ impl AzureBlobStorage {
    ) -> Result<StorageMetadata, DownloadError> {
        let builder = blob_client.get_metadata();

-        match builder.into_future().await {
-            Ok(r) => {
-                let mut map = HashMap::new();
+        let response = builder.into_future().await.map_err(to_download_error)?;
+        let mut map = HashMap::new();

-                for md in r.metadata.iter() {
-                    map.insert(
-                        md.name().as_str().to_string(),
-                        md.value().as_str().to_string(),
-                    );
-                }
-                Ok(StorageMetadata(map))
-            }
-            Err(e) => {
-                return Err(if let Some(http_err) = e.as_http_error() {
-                    match http_err.status() {
-                        StatusCode::NotFound => DownloadError::NotFound,
-                        StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
-                        _ => DownloadError::Other(anyhow::Error::new(e)),
-                    }
-                } else {
-                    DownloadError::Other(e.into())
-                });
-            }
+        for md in response.metadata.iter() {
+            map.insert(
+                md.name().as_str().to_string(),
+                md.value().as_str().to_string(),
+            );
        }
+        Ok(StorageMetadata(map))
    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
@@ -199,13 +170,24 @@ fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
    res
 }

+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
-    async fn list(
+    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError> {
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_name(p))
@@ -213,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut builder = self.client.list_blobs();
-
-        if let ListingMode::WithDelimiter = mode {
-            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-        }
+        let mut builder = self
+            .client
+            .list_blobs()
+            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());

        if let Some(prefix) = list_prefix {
            builder = builder.prefix(Cow::from(prefix.to_owned()));
@@ -236,39 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
        }

        let mut response = builder.into_stream();
-        let mut res = Listing::default();
-        while let Some(l) = response.next().await {
-            let entry = match l {
-                Ok(l) => l,
-                Err(e) => {
-                    return Err(if let Some(http_err) = e.as_http_error() {
-                        match http_err.status() {
-                            StatusCode::NotFound => DownloadError::NotFound,
-                            StatusCode::BadRequest => {
-                                DownloadError::BadInput(anyhow::Error::new(e))
-                            }
-                            _ => DownloadError::Other(anyhow::Error::new(e)),
-                        }
-                    } else {
-                        DownloadError::Other(e.into())
-                    });
-                }
-            };
-
-            let prefix_iter = entry
+        let mut res = Vec::new();
+        while let Some(entry) = response.next().await {
+            let entry = entry.map_err(to_download_error)?;
+            let name_iter = entry
                .blobs
                .prefixes()
                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.prefixes.extend(prefix_iter);
-
-            let blob_iter = entry
-                .blobs
-                .blobs()
-                .map(|k| self.name_to_relative_path(&k.name));
-            res.keys.extend(blob_iter);
+            res.extend(name_iter);
        }
        Ok(res)
    }
+
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let folder_name = folder
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone());
+
+        let mut builder = self.client.list_blobs();
+
+        if let Some(folder_name) = folder_name {
+            builder = builder.prefix(Cow::from(folder_name.to_owned()));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Vec::new();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(anyhow::Error::new)?;
+            let name_iter = entry
+                .blobs
+                .blobs()
+                .map(|bl| self.name_to_relative_path(&bl.name));
+            res.extend(name_iter);
+        }
+        Ok(res)
+    }
+
    async fn upload(
        &self,
        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -129,22 +129,6 @@ impl RemotePath {
    }
 }

-/// We don't need callers to be able to pass arbitrary delimiters: just control
-/// whether listings will use a '/' separator or not.
-///
-/// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
-/// NoDelimiter mode will only populate `keys`.
-pub enum ListingMode {
-    WithDelimiter,
-    NoDelimiter,
-}
-
-#[derive(Default)]
-pub struct Listing {
-    pub prefixes: Vec<RemotePath>,
-    pub keys: Vec<RemotePath>,
-}
-
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -157,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
+    ) -> Result<Vec<RemotePath>, DownloadError>;
+
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
@@ -175,21 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
-        Ok(result)
-    }
-
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        _mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError>; /* {
-                                                     // XXX Placeholder impl.
-                                                     let mut result = Listing::default();
-                                                     result.prefixes = self.list_prefixes(prefix).await?;
-                                                     Ok(result)
-                                                 }*/
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -240,9 +205,6 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
-    /// A cancellation token aborted the download, typically during
-    /// tenant detach or process shutdown.
-    Cancelled,
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -253,7 +215,6 @@ impl std::fmt::Display for DownloadError {
            DownloadError::BadInput(e) => {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
-            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
@@ -273,19 +234,6 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
-    pub async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list(prefix, mode).await,
-            Self::AwsS3(s) => s.list(prefix, mode).await,
-            Self::AzureBlob(s) => s.list(prefix, mode).await,
-            Self::Unreliable(s) => s.list(prefix, mode).await,
-        }
-    }
-
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -15,7 +15,7 @@ use tokio::{
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -75,7 +75,7 @@ impl LocalFs {
    }

    #[cfg(test)]
-    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
@@ -89,10 +89,52 @@ impl LocalFs {
            })
            .collect())
    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
+            .await
+            .map_err(DownloadError::Other)?;
+
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            prefixes.push(
+                prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    ),
+            )
+        }
+
+        Ok(prefixes)
+    }

    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
-    async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
@@ -144,61 +186,6 @@ impl LocalFs {

        Ok(files)
    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
-        let mut result = Listing::default();
-
-        if let ListingMode::NoDelimiter = mode {
-            result.keys = self
-                .list_recursive(prefix)
-                .await
-                .map_err(DownloadError::Other)?;
-            return Ok(result);
-        }
-
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            let stripped = prefix
-                .strip_prefix(&self.storage_root)
-                .context("Failed to strip prefix")
-                .and_then(RemotePath::new)
-                .expect(
-                    "We list files for storage root, hence should be able to remote the prefix",
-                );
-
-            if prefix.is_dir() {
-                result.prefixes.push(stripped);
-            } else {
-                result.keys.push(stripped)
-            }
-        }
-
-        Ok(result)
-    }

    async fn upload(
        &self,
@@ -492,7 +479,7 @@ mod fs_tests {

        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
        assert_eq!(
-            storage.list_all().await?,
+            storage.list().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );
@@ -680,7 +667,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
-        assert!(storage.list_all().await?.is_empty());
+        assert!(storage.list().await?.is_empty());

        storage
            .delete(&upload_target)
@@ -790,7 +777,7 @@ mod fs_tests {
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
-        let mut files = storage.list_all().await?;
+        let mut files = storage.list().await?;
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -30,8 +30,8 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -299,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    async fn list(
+    /// See the doc for `RemoteStorage::list_prefixes`.
+    /// Note: it won't include empty "directories".
+    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        let kind = RequestKind::List;
-        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -314,33 +314,28 @@ impl RemoteStorage for S3Bucket {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

+        let mut document_keys = Vec::new();
+
        let mut continuation_token = None;

        loop {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            let mut request = self
+            let fetch_response = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response);
-
-            if let ListingMode::WithDelimiter = mode {
-                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-            }
-
-            let response = request
+                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
+                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
                .context("Failed to list S3 prefixes")
@@ -350,35 +345,71 @@ impl RemoteStorage for S3Bucket {

            metrics::BUCKET_METRICS
                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
+                .observe_elapsed(kind, &fetch_response, started_at);

-            let response = response?;
+            let fetch_response = fetch_response?;

-            let keys = response.contents().unwrap_or_default();
-            let empty = Vec::new();
-            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
-
-            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
-
-            for object in keys {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                result.keys.push(remote_path);
-            }
-
-            result.prefixes.extend(
-                prefixes
-                    .iter()
+            document_keys.extend(
+                fetch_response
+                    .common_prefixes
+                    .unwrap_or_default()
+                    .into_iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match response.next_continuation_token {
+            continuation_token = match fetch_response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

-        Ok(result)
+        Ok(document_keys)
+    }
+
+    /// See the doc for `RemoteStorage::list_files`
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let kind = RequestKind::List;
+
+        let folder_name = folder
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        // AWS may need to break the response into several parts
+        let mut continuation_token = None;
+        let mut all_files = vec![];
+        loop {
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);
+
+            let response = self
+                .client
+                .list_objects_v2()
+                .bucket(self.bucket_name.clone())
+                .set_prefix(folder_name.clone())
+                .set_continuation_token(continuation_token)
+                .set_max_keys(self.max_keys_per_list_response)
+                .send()
+                .await
+                .context("Failed to list files in S3 bucket");
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;
+
+            for object in response.contents().unwrap_or_default() {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                all_files.push(remote_path);
+            }
+            match response.next_continuation_token {
+                Some(new_token) => continuation_token = Some(new_token),
+                None => break,
+            }
+        }
+        Ok(all_files)
    }

    async fn upload(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;

-use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
-};
+use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};

 pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_files(folder).await
    }

-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
-        self.inner.list(prefix, mode).await
-    }
-
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,36 +1,12 @@
-use std::sync::{atomic::AtomicI32, Arc};
+use std::sync::Arc;

 use tokio::sync::{mpsc, Mutex};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
-pub struct Completion {
-    sender: mpsc::Sender<()>,
-    refcount: Arc<AtomicI32>,
-}
-
-impl Clone for Completion {
-    fn clone(&self) -> Self {
-        let i = self
-            .refcount
-            .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
-        tracing::info!("Completion::clone[{:p}]: {i}", &(*self.refcount));
-        Self {
-            sender: self.sender.clone(),
-            refcount: self.refcount.clone(),
-        }
-    }
-}
-
-impl Drop for Completion {
-    fn drop(&mut self) {
-        let i = self
-            .refcount
-            .fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
-        tracing::info!("Completion::drop[{:p}]: {i}", &(*self.refcount));
-    }
-}
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
@@ -69,11 +45,5 @@ pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
    let rx = Mutex::new(rx);
    let rx = Arc::new(rx);
-    (
-        Completion {
-            sender: tx,
-            refcount: Arc::new(AtomicI32::new(1)),
-        },
-        Barrier(rx),
-    )
+    (Completion(tx), Barrier(rx))
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -253,11 +253,22 @@ impl Runner {
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

+            // NB: The ordering of these conditions is intentional. During startup, we should deny
+            // downscaling until we have enough information to determine that it's safe to do so
+            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
+            // received any information, we should *fail* instead of just denying downscaling.
+            //
+            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
+            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
+            // OR if we haven't received metrics *recently enough*.
+            //
            // TODO: make the duration here configurable.
            if last_time.elapsed() > Duration::from_secs(5) {
                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
            } else if last_history.samples_count <= 1 {
-                bail!("haven't received enough cgroup memory stats yet");
+                let status = "haven't received enough cgroup memory stats yet";
+                info!(status, "discontinuing downscale");
+                return Ok((false, status.to_owned()));
            }

            let new_threshold = self
@@ -505,11 +516,14 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        let error = e.to_string();
-                                        warn!(?error, "error handling message");
+                                        // use {:#} for our logging because the display impl only
+                                        // gives the outermost cause, and the debug impl
+                                        // pretty-prints the error, whereas {:#} contains all the
+                                        // causes, but is compact (no newlines).
+                                        warn!(error = format!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error
+                                                error: e.to_string(),
                                            },
                                            message.id
                                        )
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "walproposer"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+utils.workspace = true
+postgres_ffi.workspace = true
+
+workspace_hack.workspace = true
+
+[build-dependencies]
+anyhow.workspace = true
+bindgen.workspace = true
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -0,0 +1 @@
+#include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -0,0 +1,113 @@
+use std::{env, path::PathBuf, process::Command};
+
+use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;
+
+/// Build script: emits cargo link directives for the static pgport, pgcommon and walproposer
+/// libraries and generates Rust bindings for `bindgen_deps.h` into `$OUT_DIR/bindings.rs`.
+fn main() -> anyhow::Result<()> {
+    // Tell cargo to rerun this script whenever the wrapper or the install dir override changes
+    println!("cargo:rerun-if-changed=bindgen_deps.h");
+    println!("cargo:rerun-if-env-changed=POSTGRES_INSTALL_DIR");
+
+    // Finding the location of built libraries and Postgres C headers:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
+    // - if there's a `v16/bin/pg_config` file use it for getting include server, otherwise use `<pg_install>/v16/include/postgresql/server`
+    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        postgres_install_dir.into()
+    } else {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
+    };
+
+    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
+    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
+    let walproposer_lib_search_str = walproposer_lib_dir
+        .to_str()
+        .ok_or_else(|| anyhow!("Bad non-UTF path"))?;
+
+    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
+    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
+    let pgxn_neon = pgxn_neon.to_str().ok_or_else(|| anyhow!("Bad non-UTF path"))?;
+
+    println!("cargo:rustc-link-lib=static=pgport");
+    println!("cargo:rustc-link-lib=static=pgcommon");
+    println!("cargo:rustc-link-lib=static=walproposer");
+    println!("cargo:rustc-link-search={walproposer_lib_search_str}");
+
+    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .context("failed to execute `pg_config --includedir-server`")?;
+
+        if !output.status.success() {
+            anyhow::bail!("`pg_config --includedir-server` failed");
+        }
+
+        String::from_utf8(output.stdout)
+            .context("pg_config output is not UTF-8")?
+            .trim_end()
+            .into()
+    } else {
+        let server_path = pg_install_abs
+            .join("v16")
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string();
+        server_path
+            .into_string()
+            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
+    };
+
+    // The bindgen::Builder is the main entry point to bindgen, and lets you
+    // build up options for the resulting bindings.
+    let bindings = bindgen::Builder::default()
+        // The input header we would like to generate bindings for.
+        .header("bindgen_deps.h")
+        // Tell cargo to invalidate the built crate whenever any included header file changes.
+        .parse_callbacks(Box::new(CargoCallbacks))
+        .allowlist_type("WalProposer")
+        .allowlist_type("WalProposerConfig")
+        .allowlist_type("walproposer_api")
+        .allowlist_function("WalProposerCreate")
+        .allowlist_function("WalProposerStart")
+        .allowlist_function("WalProposerBroadcast")
+        .allowlist_function("WalProposerPoll")
+        .allowlist_function("WalProposerFree")
+        .allowlist_var("DEBUG5")
+        .allowlist_var("DEBUG4")
+        .allowlist_var("DEBUG3")
+        .allowlist_var("DEBUG2")
+        .allowlist_var("DEBUG1")
+        .allowlist_var("LOG")
+        .allowlist_var("INFO")
+        .allowlist_var("NOTICE")
+        .allowlist_var("WARNING")
+        .allowlist_var("ERROR")
+        .allowlist_var("FATAL")
+        .allowlist_var("PANIC")
+        .allowlist_var("WPEVENT")
+        .allowlist_var("WL_LATCH_SET")
+        .allowlist_var("WL_SOCKET_READABLE")
+        .allowlist_var("WL_SOCKET_WRITEABLE")
+        .allowlist_var("WL_TIMEOUT")
+        .allowlist_var("WL_SOCKET_CLOSED")
+        .allowlist_var("WL_SOCKET_MASK")
+        .clang_arg("-DWALPROPOSER_LIB")
+        .clang_arg(format!("-I{pgxn_neon}"))
+        .clang_arg(format!("-I{inc_server_path}"))
+        // Finish the builder and generate the bindings.
+        .generate()
+        // Unwrap the Result and panic on failure.
+        .expect("Unable to generate bindings");
+
+    // Write the bindings to the $OUT_DIR/bindings.rs file.
+    let out_dir = env::var("OUT_DIR").context("OUT_DIR is not set")?;
+    bindings
+        .write_to_file(PathBuf::from(out_dir).join("bindings.rs"))
+        .context("Couldn't write bindings!")?;
+
+    Ok(())
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -0,0 +1,455 @@
+#![allow(dead_code)]
+
+use std::ffi::CStr;
+use std::ffi::CString;
+
+use crate::bindings::uint32;
+use crate::bindings::walproposer_api;
+use crate::bindings::PGAsyncReadResult;
+use crate::bindings::PGAsyncWriteResult;
+use crate::bindings::Safekeeper;
+use crate::bindings::Size;
+use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
+use crate::bindings::TimestampTz;
+use crate::bindings::WalProposer;
+use crate::bindings::WalProposerConnStatusType;
+use crate::bindings::WalProposerConnectPollStatusType;
+use crate::bindings::WalProposerExecStatusType;
+use crate::bindings::WalproposerShmemState;
+use crate::bindings::XLogRecPtr;
+use crate::walproposer::ApiImpl;
+use crate::walproposer::WaitResult;
+
+extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_shmem_state()
+    }
+}
+
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).start_streaming(startpos)
+    }
+}
+
+extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_flush_rec_ptr()
+    }
+}
+
+extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_current_timestamp()
+    }
+}
+
+extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let msg = (*api).conn_error_message(&mut (*sk));
+        let msg = CString::new(msg).unwrap();
+        // TODO: fix leaking error message
+        msg.into_raw()
+    }
+}
+
+extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_status(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_start(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_poll(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
+    let query = unsafe { CStr::from_ptr(query) };
+    let query = query.to_str().unwrap();
+
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_send_query(&mut (*sk), query)
+    }
+}
+
+extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_get_query_result(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_flush(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_finish(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_finish(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_async_read(
+    sk: *mut Safekeeper,
+    buf: *mut *mut ::std::os::raw::c_char,
+    amount: *mut ::std::os::raw::c_int,
+) -> PGAsyncReadResult {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let (res, result) = (*api).conn_async_read(&mut (*sk));
+
+        // The returned buf must stay valid until the next call to this function,
+        // so we keep a Vec in each Safekeeper (`sk->inbuf`) and reuse its
+        // allocation on every call.
+        let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
+
+        inbuf.clear();
+        inbuf.extend_from_slice(res);
+
+        // Put the Vec back into sk->inbuf and hand its data pointer and length to the caller.
+        *buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
+        *amount = res.len() as i32;
+
+        result
+    }
+}
+
+extern "C" fn conn_async_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> PGAsyncWriteResult {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_async_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn conn_blocking_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_blocking_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_read(&mut (*sk), buf, startptr)
+    }
+}
+
+extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_reader_allocate(&mut (*sk));
+    }
+}
+
+extern "C" fn free_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).free_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn init_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).init_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).update_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).add_safekeeper_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn wait_event_set(
+    wp: *mut WalProposer,
+    timeout: ::std::os::raw::c_long,
+    event_sk: *mut *mut Safekeeper,
+    events: *mut uint32,
+) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let result = (*api).wait_event_set(&mut (*wp), timeout);
+        match result {
+            WaitResult::Latch => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_LATCH_SET;
+                1
+            }
+            WaitResult::Timeout => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_TIMEOUT;
+                0
+            }
+            WaitResult::Network(sk, event_mask) => {
+                *event_sk = sk;
+                *events = event_mask;
+                1
+            }
+        }
+    }
+}
+
+extern "C" fn strong_random(
+    wp: *mut WalProposer,
+    buf: *mut ::std::os::raw::c_void,
+    len: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).strong_random(buf)
+    }
+}
+
+extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_redo_start_lsn()
+    }
+}
+
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).finish_sync_safekeepers(lsn)
+    }
+}
+
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+    }
+}
+
+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).confirm_wal_streamed(&mut (*wp), lsn)
+    }
+}
+
+/// C callback: forwards one log line from walproposer to `ApiImpl::log_internal`.
+extern "C" fn log_internal(
+    wp: *mut WalProposer,
+    level: ::std::os::raw::c_int,
+    line: *const ::std::os::raw::c_char,
+) {
+    unsafe {
+        let api = (*(*wp).config).callback_data as *mut Box<dyn ApiImpl>;
+        // Lossy conversion: a non-UTF-8 log line must not panic inside a C callback.
+        let line = CStr::from_ptr(line).to_string_lossy();
+        (*api).log_internal(&mut (*wp), Level::from(level as u32), &line)
+    }
+}
+
+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).after_election(&mut (*wp))
+    }
+}
+
+#[derive(Debug)]
+pub enum Level {
+    Debug5,
+    Debug4,
+    Debug3,
+    Debug2,
+    Debug1,
+    Log,
+    Info,
+    Notice,
+    Warning,
+    Error,
+    Fatal,
+    Panic,
+    WPEvent,
+}
+
+impl Level {
+    pub fn from(elevel: u32) -> Level {
+        use crate::bindings::*;
+
+        match elevel {
+            DEBUG5 => Level::Debug5,
+            DEBUG4 => Level::Debug4,
+            DEBUG3 => Level::Debug3,
+            DEBUG2 => Level::Debug2,
+            DEBUG1 => Level::Debug1,
+            LOG => Level::Log,
+            INFO => Level::Info,
+            NOTICE => Level::Notice,
+            WARNING => Level::Warning,
+            ERROR => Level::Error,
+            FATAL => Level::Fatal,
+            PANIC => Level::Panic,
+            WPEVENT => Level::WPEvent,
+            _ => panic!("unknown log level {}", elevel),
+        }
+    }
+}
+
+pub(crate) fn create_api() -> walproposer_api {
+    walproposer_api {
+        get_shmem_state: Some(get_shmem_state),
+        start_streaming: Some(start_streaming),
+        get_flush_rec_ptr: Some(get_flush_rec_ptr),
+        get_current_timestamp: Some(get_current_timestamp),
+        conn_error_message: Some(conn_error_message),
+        conn_status: Some(conn_status),
+        conn_connect_start: Some(conn_connect_start),
+        conn_connect_poll: Some(conn_connect_poll),
+        conn_send_query: Some(conn_send_query),
+        conn_get_query_result: Some(conn_get_query_result),
+        conn_flush: Some(conn_flush),
+        conn_finish: Some(conn_finish),
+        conn_async_read: Some(conn_async_read),
+        conn_async_write: Some(conn_async_write),
+        conn_blocking_write: Some(conn_blocking_write),
+        recovery_download: Some(recovery_download),
+        wal_read: Some(wal_read),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
+        init_event_set: Some(init_event_set),
+        update_event_set: Some(update_event_set),
+        add_safekeeper_event_set: Some(add_safekeeper_event_set),
+        wait_event_set: Some(wait_event_set),
+        strong_random: Some(strong_random),
+        get_redo_start_lsn: Some(get_redo_start_lsn),
+        finish_sync_safekeepers: Some(finish_sync_safekeepers),
+        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
+        log_internal: Some(log_internal),
+        after_election: Some(after_election),
+    }
+}
+
+impl std::fmt::Display for Level {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+/// Take ownership of the `Vec<u8>` held in `pg`, resetting `pg` to empty; `None` if data is null.
+pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
+    if pg.data.is_null() {
+        return None;
+    }
+    // Read the raw parts before the struct is cleared below.
+    let ptr = pg.data as *mut u8;
+    let length = pg.len as usize;
+    let capacity = pg.maxlen as usize;
+    // Reset `pg` so it no longer points at the buffer being handed out.
+    pg.data = std::ptr::null_mut();
+    pg.len = 0;
+    pg.maxlen = 0;
+    // NOTE(review): sound only if `pg` was filled by `store_vec_u8` — confirm for all callers.
+    unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
+}
+
+/// Store `Vec<u8>` in StringInfoData; the allocation is forgotten here, not freed.
+fn store_vec_u8(pg: &mut StringInfoData, mut vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
+    let length = vec.len();
+    let capacity = vec.capacity();
+    // `as_mut_ptr`, not `as_ptr`: `take_vec_u8` later rebuilds a writable Vec from this pointer.
+    let ptr = vec.as_mut_ptr() as *mut ::std::os::raw::c_char;
+
+    assert!(pg.data.is_null());
+    // `len`/`maxlen` are 32-bit; a truncated capacity would corrupt `Vec::from_raw_parts` later.
+    assert!(capacity <= i32::MAX as usize, "buffer too large for StringInfoData");
+    pg.data = ptr;
+    pg.len = length as i32;
+    pg.maxlen = capacity as i32;
+
+    std::mem::forget(vec);
+    ptr
+}
--- a/libs/walproposer/src/lib.rs
+++ b/libs/walproposer/src/lib.rs
@@ -0,0 +1,14 @@
+pub mod bindings {
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    // bindgen creates some unsafe code with no doc comments.
+    #![allow(clippy::missing_safety_doc)]
+    // noted at 1.63 that in many cases there are u32 -> u32 transmutes in bindgen code.
+    #![allow(clippy::useless_transmute)]
+
+    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
+}
+
+pub mod api_bindings;
+pub mod walproposer;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -0,0 +1,485 @@
+use std::ffi::CString;
+
+use postgres_ffi::WAL_SEGMENT_SIZE;
+use utils::id::TenantTimelineId;
+
+use crate::{
+    api_bindings::{create_api, take_vec_u8, Level},
+    bindings::{
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
+    },
+};
+
+/// Rust high-level wrapper for C walproposer API. Many methods are not required
+/// for simple cases, hence todo!() in default implementations.
+///
+/// Every default body is `todo!()`, so invoking a callback that the
+/// implementation did not override panics at runtime.
+///
+/// Refer to `pgxn/neon/walproposer.h` for documentation.
+pub trait ApiImpl {
+    // NOTE(review): hands out `&mut` from `&self`; implementations presumably
+    // back this with shared memory or interior mutability — confirm.
+    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
+        todo!()
+    }
+
+    fn start_streaming(&self, _startpos: u64) {
+        todo!()
+    }
+
+    fn get_flush_rec_ptr(&self) -> u64 {
+        todo!()
+    }
+
+    fn get_current_timestamp(&self) -> i64 {
+        todo!()
+    }
+
+    // The `conn_*` callbacks below all operate on one safekeeper at a time.
+
+    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
+        todo!()
+    }
+
+    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
+        todo!()
+    }
+
+    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_connect_poll(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerConnectPollStatusType {
+        todo!()
+    }
+
+    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
+        todo!()
+    }
+
+    fn conn_get_query_result(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerExecStatusType {
+        todo!()
+    }
+
+    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
+        todo!()
+    }
+
+    fn conn_finish(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    /// Returns the bytes that were read together with the read status; the
+    /// slice borrows from `self`.
+    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+        todo!()
+    }
+
+    fn conn_async_write(
+        &self,
+        _sk: &mut Safekeeper,
+        _buf: &[u8],
+    ) -> crate::bindings::PGAsyncWriteResult {
+        todo!()
+    }
+
+    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
+        todo!()
+    }
+
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
+        todo!()
+    }
+
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
+        todo!()
+    }
+
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn free_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn init_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    /// See [`WaitResult`] for the possible outcomes.
+    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
+        todo!()
+    }
+
+    fn strong_random(&self, _buf: &mut [u8]) -> bool {
+        todo!()
+    }
+
+    fn get_redo_start_lsn(&self) -> u64 {
+        todo!()
+    }
+
+    fn finish_sync_safekeepers(&self, _lsn: u64) {
+        todo!()
+    }
+
+    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+        todo!()
+    }
+
+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
+    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
+        todo!()
+    }
+
+    fn after_election(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+}
+
+/// Outcome reported by [`ApiImpl::wait_event_set`].
+pub enum WaitResult {
+    // NOTE(review): presumably "woken up by the latch" — confirm against walproposer.h
+    Latch,
+    // NOTE(review): presumably "the wait timed out" — confirm against walproposer.h
+    Timeout,
+    /// An event for a particular safekeeper: the safekeeper pointer and the
+    /// events mask.
+    Network(*mut Safekeeper, u32),
+}
+
+/// Rust-side walproposer settings; `Wrapper::new` copies them into the C
+/// `WalProposerConfig`.
+pub struct Config {
+    /// Tenant and timeline id
+    pub ttid: TenantTimelineId,
+    /// List of safekeepers in format `host:port`
+    pub safekeepers_list: Vec<String>,
+    /// Safekeeper reconnect timeout in milliseconds
+    pub safekeeper_reconnect_timeout: i32,
+    /// Safekeeper connection timeout in milliseconds
+    pub safekeeper_connection_timeout: i32,
+    /// walproposer mode, finish when all safekeepers are synced or subscribe
+    /// to WAL streaming
+    pub sync_safekeepers: bool,
+}
+
+/// WalProposer main struct. C methods are reexported as Rust functions.
+pub struct Wrapper {
+    // Handle returned by `WalProposerCreate`; released in `Drop`.
+    wp: *mut WalProposer,
+    // Backing storage for the NUL-terminated safekeepers list whose pointer is
+    // stored in the C config; held here so it lives as long as the wrapper.
+    _safekeepers_list_vec: Vec<u8>,
+}
+
+impl Wrapper {
+    /// Create a C walproposer instance that is driven by the callbacks in `api`.
+    ///
+    /// The tenant/timeline id strings, the C config struct and the boxed `api`
+    /// are turned into raw pointers here and reclaimed in `Drop for Wrapper`.
+    /// The safekeepers list stays owned by the returned `Wrapper`; C only
+    /// receives a pointer into it.
+    ///
+    /// Panics if the tenant id, timeline id or joined safekeepers list
+    /// contains an interior NUL byte.
+    pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
+        let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
+            .unwrap()
+            .into_raw();
+        let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
+            .unwrap()
+            .into_raw();
+
+        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
+            .unwrap()
+            .into_bytes_with_nul();
+        // NOTE(review): the reason for requiring len == capacity is not visible
+        // in this file — confirm before relaxing.
+        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
+        // Cast to `c_char` rather than `i8`: `c_char` is unsigned on some
+        // targets (e.g. aarch64 Linux), where `*mut i8` does not match the
+        // generated field type.
+        let safekeepers_list =
+            safekeepers_list_vec.as_mut_ptr() as *mut ::std::os::raw::c_char;
+
+        // `Box<dyn ApiImpl>` is a fat pointer, so it is boxed once more to get
+        // a thin pointer that fits into the C `void *`.
+        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
+
+        let c_config = WalProposerConfig {
+            neon_tenant,
+            neon_timeline,
+            safekeepers_list,
+            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
+            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
+            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
+            syncSafekeepers: config.sync_safekeepers,
+            systemId: 0,
+            pgTimeline: 1,
+            callback_data,
+        };
+        let c_config = Box::into_raw(Box::new(c_config));
+
+        let api = create_api();
+        let wp = unsafe { WalProposerCreate(c_config, api) };
+        Wrapper {
+            wp,
+            _safekeepers_list_vec: safekeepers_list_vec,
+        }
+    }
+
+    /// Hand control to the C `WalProposerStart` for this instance.
+    pub fn start(&self) {
+        unsafe { WalProposerStart(self.wp) }
+    }
+}
+
+// Releases everything `Wrapper::new` turned into raw pointers, then frees the
+// C walproposer itself.
+impl Drop for Wrapper {
+    fn drop(&mut self) {
+        unsafe {
+            let config = (*self.wp).config;
+            // Undo `Box::into_raw(Box::new(api))`: this is the outer box around
+            // the `Box<dyn ApiImpl>`.
+            drop(Box::from_raw(
+                (*config).callback_data as *mut Box<dyn ApiImpl>,
+            ));
+            // Undo the two `CString::into_raw` calls.
+            drop(CString::from_raw((*config).neon_tenant));
+            drop(CString::from_raw((*config).neon_timeline));
+            // Undo `Box::into_raw` of the config struct; `config` must not be
+            // dereferenced after this line.
+            drop(Box::from_raw(config));
+
+            // Take each safekeeper's `inbuf` back as a Vec and drop it right
+            // away, leaving the field null.
+            // NOTE(review): presumably done so that WalProposerFree does not
+            // try to free Rust-allocated memory — confirm against the C code.
+            for i in 0..(*self.wp).n_safekeepers {
+                let sk = &mut (*self.wp).safekeeper[i as usize];
+                take_vec_u8(&mut sk.inbuf);
+            }
+
+            WalProposerFree(self.wp);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        cell::Cell,
+        sync::{atomic::AtomicUsize, mpsc::sync_channel},
+    };
+
+    use utils::id::TenantTimelineId;
+
+    use crate::{api_bindings::Level, walproposer::Wrapper};
+
+    use super::ApiImpl;
+
+    // Safekeeper pointer and event mask pair that the mock hands back from
+    // `wait_event_set`.
+    #[derive(Clone, Copy, Debug)]
+    struct WaitEventsData {
+        sk: *mut crate::bindings::Safekeeper,
+        event_mask: u32,
+    }
+
+    // Scripted `ApiImpl`: checks outgoing messages against a fixed list and
+    // answers reads from a fixed list of replies. The `*_ptr` counters are the
+    // index of the next entry to use in the corresponding list.
+    struct MockImpl {
+        // data to return from wait_event_set
+        wait_events: Cell<WaitEventsData>,
+        // walproposer->safekeeper messages
+        expected_messages: Vec<Vec<u8>>,
+        expected_ptr: AtomicUsize,
+        // safekeeper->walproposer messages
+        safekeeper_replies: Vec<Vec<u8>>,
+        replies_ptr: AtomicUsize,
+        // channel to send LSN to the main thread
+        sync_channel: std::sync::mpsc::SyncSender<u64>,
+    }
+
+    impl MockImpl {
+        /// Consume the next scripted walproposer message and assert that
+        /// `msg` equals it. Panics if the script is already exhausted.
+        fn check_walproposer_msg(&self, msg: &[u8]) {
+            let idx = self
+                .expected_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            match self.expected_messages.get(idx) {
+                Some(expected) => assert_eq!(msg, expected.as_slice()),
+                None => panic!("unexpected message from walproposer"),
+            }
+        }
+
+        /// Consume and return the next scripted safekeeper reply. Panics if
+        /// there are none left.
+        fn next_safekeeper_reply(&self) -> &[u8] {
+            let idx = self
+                .replies_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            self.safekeeper_replies
+                .get(idx)
+                .map(Vec::as_slice)
+                .unwrap_or_else(|| panic!("no more safekeeper replies"))
+        }
+    }
+
+    // Mock callbacks. Most of them only print their name (and arguments) and
+    // return a fixed "success" value; the ones with real logic are commented.
+    impl ApiImpl for MockImpl {
+        fn get_current_timestamp(&self) -> i64 {
+            println!("get_current_timestamp");
+            0
+        }
+
+        fn conn_status(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnStatusType {
+            println!("conn_status");
+            crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+
+        fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("conn_connect_start");
+        }
+
+        fn conn_connect_poll(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnectPollStatusType {
+            println!("conn_connect_poll");
+            crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+        }
+
+        fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
+            println!("conn_send_query: {}", query);
+            true
+        }
+
+        fn conn_get_query_result(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerExecStatusType {
+            println!("conn_get_query_result");
+            crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+        }
+
+        // Every read succeeds and yields the next scripted safekeeper reply.
+        fn conn_async_read(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+            println!("conn_async_read");
+            let reply = self.next_safekeeper_reply();
+            println!("conn_async_read result: {:?}", reply);
+            (
+                reply,
+                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
+            )
+        }
+
+        // Every write is compared against the next expected walproposer message.
+        fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
+            println!("conn_blocking_write: {:?}", buf);
+            self.check_walproposer_msg(buf);
+            true
+        }
+
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
+        }
+
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
+        }
+
+        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("init_event_set")
+        }
+
+        // Remember the latest (safekeeper, mask) pair; `wait_event_set` replays it.
+        fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "update_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        // Same bookkeeping as `update_event_set`.
+        fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        // Never reports a latch or a timeout: always returns a network event
+        // for the most recently stored (safekeeper, mask) pair.
+        fn wait_event_set(
+            &self,
+            _: &mut crate::bindings::WalProposer,
+            timeout_millis: i64,
+        ) -> super::WaitResult {
+            let data = self.wait_events.get();
+            println!(
+                "wait_event_set, timeout_millis={}, res={:?}",
+                timeout_millis, data
+            );
+            super::WaitResult::Network(data.sk, data.event_mask)
+        }
+
+        // Deterministic "randomness": the buffer is filled with zeros.
+        fn strong_random(&self, buf: &mut [u8]) -> bool {
+            println!("strong_random");
+            buf.fill(0);
+            true
+        }
+
+        // Publish the LSN to the test thread, then panic; the test catches
+        // this panic with `catch_unwind` around `wp.start()`.
+        fn finish_sync_safekeepers(&self, lsn: u64) {
+            self.sync_channel.send(lsn).unwrap();
+            panic!("sync safekeepers finished at lsn={}", lsn);
+        }
+
+        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
+            println!("walprop_log[{}] {}", level, msg);
+        }
+
+        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
+            println!("after_election");
+        }
+    }
+
+    /// Test that walproposer can successfully connect to safekeeper and finish
+    /// sync_safekeepers. API is mocked in MockImpl.
+    ///
+    /// Run this test with valgrind to detect leaks:
+    /// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
+    #[test]
+    fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
+        // The same hex id is used for both the tenant and the timeline.
+        let ttid = TenantTimelineId::new(
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+        );
+
+        // Capacity 1 is enough: the mock sends exactly one LSN before panicking.
+        let (sender, receiver) = sync_channel(1);
+
+        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
+            wait_events: Cell::new(WaitEventsData {
+                sk: std::ptr::null_mut(),
+                event_mask: 0,
+            }),
+            expected_messages: vec![
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            expected_ptr: AtomicUsize::new(0),
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            replies_ptr: AtomicUsize::new(0),
+            sync_channel: sender,
+        });
+        let config = crate::walproposer::Config {
+            ttid,
+            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 10000,
+            sync_safekeepers: true,
+        };
+
+        let wp = Wrapper::new(my_impl, config);
+
+        // walproposer will panic when it finishes sync_safekeepers
+        std::panic::catch_unwind(|| wp.start()).unwrap_err();
+        // validate the resulting LSN
+        // (1337 == 0x539, the LSN 0/539 used throughout the mocked VoteResponse)
+        assert_eq!(receiver.recv()?, 1337);
+        Ok(())
+        // drop() will free up resources here
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,6 +2,7 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
+use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
@@ -200,6 +201,51 @@ fn initialize_config(
    })
 }

+/// What `wait_for_phase` hands back to its caller.
+struct WaitForPhaseResult<F: std::future::Future + Unpin> {
+    // Budget left for subsequent phases: the timeout that was passed in minus
+    // the time spent waiting, floored at zero.
+    timeout_remaining: Duration,
+    // `Some(future)` if the phase timed out and is still unfinished, so the
+    // caller can await it later; `None` if the phase completed in time.
+    skipped: Option<F>,
+}
+
+/// Wait for one startup phase, but for no longer than `timeout`.
+///
+/// Startup readiness waits are bounded so that a single problematic Tenant
+/// cannot stall the whole service. The time spent here is deducted from the
+/// budget, and the remainder is returned for the next phase. If the phase
+/// did not finish in time, its still-pending future is returned in `skipped`
+/// so the caller can drive it to completion later.
+async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
+where
+    F: std::future::Future + Unpin,
+{
+    let wait_started_at = Instant::now();
+
+    let timed_out = tokio::time::timeout(timeout, &mut fut).await.is_err();
+    let skipped = if timed_out {
+        tracing::info!(
+            timeout_millis = timeout.as_millis(),
+            %phase,
+            "Startup phase timed out, proceeding anyway"
+        );
+        Some(fut)
+    } else {
+        None
+    };
+
+    // Saturates at zero when the wait overshot the budget.
+    let timeout_remaining = timeout.saturating_sub(wait_started_at.elapsed());
+
+    WaitForPhaseResult {
+        timeout_remaining,
+        skipped,
+    }
+}
+
+/// Record that startup reached `phase`: sets the `STARTUP_DURATION` gauge
+/// labelled with `phase` to the seconds elapsed since `started_at`, and logs
+/// `human_phase` together with that duration.
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+    let since_start = started_at.elapsed();
+    let secs = since_start.as_secs_f64();
+
+    let gauge = STARTUP_DURATION.with_label_values(&[phase]);
+    gauge.set(secs);
+
+    info!(
+        elapsed_ms = since_start.as_millis(),
+        "{human_phase} ({secs:.3}s since start)"
+    );
+}
+
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
@@ -207,16 +253,6 @@ fn start_pageserver(
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -341,7 +377,7 @@ fn start_pageserver(

    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
+    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
@@ -361,18 +397,12 @@ fn start_pageserver(
    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();

    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
-    let (tenants_can_start, tenants_can_start_barrier) = utils::completion::channel();
-
-    tracing::info!("init_remote_done_tx:");
-    let c = init_remote_done_tx.clone();
-    drop(c);

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
-        tenants_can_start: tenants_can_start_barrier.clone(),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

@@ -394,60 +424,93 @@ fn start_pageserver(
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial load completed")
+            });

-            init_remote_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
+            let timeout = conf.background_task_maximum_delay;

-            drop(tenants_can_start);
+            let init_remote_done = std::pin::pin!(async {
+                init_remote_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load_remote",
+                    "Remote part of initial load completed",
+                );
+            });

-            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_remote_skipped,
+            } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
+
+            let init_load_done = std::pin::pin!(async {
+                init_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load",
+                    "Initial load completed",
+                );
+                STARTUP_IS_LOADING.set(0);
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_load_skipped,
+            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });

-            let timeout = conf.background_task_maximum_delay;
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });

-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-
-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
-                    None
-                }
-                Err(_) => {
-                    tracing::info!(
-                        timeout_millis = timeout.as_millis(),
-                        "Initial logical size timeout elapsed; starting background jobs"
-                    );
-                    Some(init_sizes_done)
-                }
-            };
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;

            scopeguard::ScopeGuard::into_inner(guard);

-            // allow background jobs to start
+            // allow background jobs to start: we either completed prior stages, or they reached timeout
+            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
+            // because things like consumption metrics for billing are blocked by this barrier.
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");
-
-            if let Some(init_sizes_done) = init_sizes_done {
-                // ending up here is not a bug; at the latest logical sizes will be queried by
-                // consumption metrics.
-                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-                init_sizes_done.await;
-
-                scopeguard::ScopeGuard::into_inner(guard);
-
-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+            startup_checkpoint(
+                started_startup_at,
+                "background_jobs_can_start",
+                "Starting background jobs",
+            );

+            // We are done. If we skipped any phases due to timeout, run them to completion here so that
+            // they will eventually update their startup_checkpoint, and so that we do not declare the
+            // 'complete' stage until all the other stages are really done.
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before waiting for skipped phases done")
+            });
+            if let Some(f) = init_remote_skipped {
+                f.await;
            }
+            if let Some(f) = init_load_skipped {
+                f.await;
+            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
+            scopeguard::ScopeGuard::into_inner(guard);

-            startup_checkpoint("complete", "Startup complete");
+            startup_checkpoint(started_startup_at, "complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
+    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -632,6 +633,11 @@ impl PageServerConf {
        self.tenants_path().join(tenant_id.to_string())
    }

+    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_ATTACHING_MARKER_FILENAME)
+    }
+
    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -11,6 +11,7 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -88,22 +89,12 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
    loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        if metrics.is_empty() {
-            continue;
-        }
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -142,10 +133,19 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            metric_collection_interval,
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }

@@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            synthetic_size_calculation_interval,
            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -186,8 +186,6 @@ pub struct InitializationOrder {
    /// attempt. It is important to drop this once the attempt has completed.
    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

-    pub tenants_can_start: utils::completion::Barrier,
-
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,10 +3,10 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -25,9 +25,11 @@ use super::{
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
 };

+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
@@ -58,7 +60,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
 }

 async fn create_remote_delete_mark(
@@ -237,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // Try to download the tenant's remote deletion mark: a successful download means the mark exists, `DownloadError::NotFound` means it does not.
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+    )
+    .await;
+
+    match result {
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
+    }
+}
+
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -248,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
 /// 6. Remove remote mark
 /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are two entrypoints to the process:
+/// There are three entrypoints to the process:
 /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
+/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
+/// 3. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
 ///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
@@ -349,7 +378,7 @@ impl DeleteTenantFlow {

    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
-        remote_mark_exists: bool,
+        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
        let acquire = |t: &Tenant| {
@@ -360,24 +389,66 @@ impl DeleteTenantFlow {
            )
        };

-        if remote_mark_exists {
+        let tenant_id = tenant.tenant_id;
+        // Check the local mark first; if it's there, there is no need to go to S3 to check whether the remote one exists.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
            return Ok(acquire(tenant));
        }

-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        match tokio::fs::metadata(conf.tenant_deleted_mark_file_path(&tenant_id)).await {
-            Ok(_) => Ok(acquire(tenant)),
-            Err(_) => Ok(None),
+        let remote_storage = match remote_storage {
+            Some(remote_storage) => remote_storage,
+            None => return Ok(None),
+        };
+
+        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+            Ok(acquire(tenant))
+        } else {
+            Ok(None)
        }
    }

+    pub(crate) async fn resume_from_load(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, true, false)
+            .await
+            .expect("cant be stopping or broken");
+
+        // Do not consume valuable resources during the load phase; continue the deletion only once the init phase is complete.
+        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+        if let Some(background) = background_jobs_can_start {
+            info!("waiting for backgound jobs barrier");
+            background.clone().wait().await;
+            info!("ready for backgound jobs barrier");
+        }
+
+        // The tenant may not be loadable if we failed late in cleanup_remaining_fs_traces (e.g. after removing the timelines dir), so only load it while that dir still exists.
+        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
+        if timelines_path.exists() {
+            tenant.load(init_order, None, ctx).await.context("load")?;
+        }
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
-        preload: Option<TenantPreload>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -388,7 +459,7 @@ impl DeleteTenantFlow {
            .expect("cant be stopping or broken");

        tenant
-            .attach(init_order, preload, ctx)
+            .attach(ctx, super::AttachMarkerMode::Expect)
            .await
            .context("attach")?;

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,7 +26,10 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
+    TenantState,
+};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -329,12 +332,7 @@ async fn init_load_tenant_configs(
            .read_dir_utf8()
            .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;

-        let mut result = Vec::new();
-        for dentry in dir_entries {
-            result.push(dentry?);
-        }
-
-        Ok(result)
+        Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
    })
    .await??;

@@ -346,10 +344,9 @@ async fn init_load_tenant_configs(
    }

    while let Some(r) = join_set.join_next().await {
-        match r?? {
-            Some((tenant_id, tenant_config)) => configs.insert(tenant_id, tenant_config),
-            None => None,
-        };
+        if let Some((tenant_id, tenant_config)) = r?? {
+            configs.insert(tenant_id, tenant_config);
+        }
    }

    Ok(configs)
@@ -499,24 +496,45 @@ pub(crate) fn schedule_local_tenant_processing(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!("Attaching tenant {tenant_id}");
-    let tenant = match Tenant::spawn(
-        conf,
-        tenant_id,
-        resources,
-        location_conf,
-        init_order,
-        tenants,
-        SpawnMode::Normal,
-        ctx,
-    ) {
-        Ok(tenant) => tenant,
-        Err(e) => {
-            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
+        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
+        if resources.remote_storage.is_none() {
+            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
+        } else {
+            match Tenant::spawn_attach(
+                conf,
+                tenant_id,
+                resources,
+                location_conf,
+                tenants,
+                AttachMarkerMode::Expect,
+                ctx,
+            ) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
        }
+    } else {
+        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
+        // Start loading the tenant into memory. It will initially be in Loading state.
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            location_conf,
+            resources,
+            init_order,
+            tenants,
+            ctx,
+        )
    };
-
    Ok(tenant)
 }

@@ -658,13 +676,13 @@ pub(crate) async fn create_tenant(
        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
-
+        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let created_tenant = Tenant::spawn(conf, tenant_id, resources,
-        AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Create, ctx)?;
+        let created_tenant =
+            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
+                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -815,7 +833,7 @@ pub(crate) async fn upsert_location(
                        .await
                        .map_err(SetNewTenantConfigError::Persist)?;

-                    let tenant = match Tenant::spawn(
+                    let tenant = match Tenant::spawn_attach(
                        conf,
                        tenant_id,
                        TenantSharedResources {
@@ -824,14 +842,16 @@ pub(crate) async fn upsert_location(
                            deletion_queue_client,
                        },
                        AttachedTenantConf::try_from(new_location_config)?,
-                        None,
                        &TENANTS,
-                        SpawnMode::Normal,
+                        // The LocationConf API does not use marker files, because we have Secondary
+                        // locations where the directory's existence is not a signal that it contains
+                        // all timelines.  See https://github.com/neondatabase/neon/issues/5550
+                        AttachMarkerMode::Ignore,
                        ctx,
                    ) {
                        Ok(tenant) => tenant,
                        Err(e) => {
-                            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
+                            error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
                            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
                        }
                    };
@@ -1097,10 +1117,17 @@ pub(crate) async fn attach_tenant(
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
+        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

+        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
+        let marker_file_exists = conf
+            .tenant_attaching_mark_file_path(&tenant_id)
+            .try_exists()
+            .context("check for attach marker file existence")?;
+        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
+
        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -170,14 +170,36 @@
 //!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
 //!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
+//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
+//!     the local filesystem, write the remote metadata to the local filesystem
 //! - After the above is done for each timeline, open the tenant for business by
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
+//! We keep track of the fact that a client is in `Attaching` state in a marker
+//! file on the local disk. This is critical because, when we restart the pageserver,
+//! we do not want to do the `List timelines` step for each tenant that has already
+//! been successfully attached (for performance & cost reasons).
+//! Instead, for a tenant without the attach marker file, we assume that the
+//! local state is in sync or ahead of the remote state. This includes the list
+//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
+//! if there's a timeline on the remote that the pageserver doesn't know about,
+//! the GC will not consider its branch point, leading to data loss.
+//! So, for a tenant with the attach marker file, we know that we have not yet
+//! persisted all the remote timelines' metadata files locally. To exclude the
+//! risk above, we re-run the procedure for such tenants.
+//!
 //! # Operating Without Remote Storage
 //!
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
+//! Theoretically, it should be ok to remove and re-add remote storage configuration to
+//! the pageserver config at any time, since it doesn't make a difference to
+//! [`Timeline::load_layer_map`].
+//! Of course, the remote timeline dir must not change while we have de-configured
+//! remote storage, i.e., the pageserver must remain the owner of the given prefix
+//! in remote storage.
+//! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(
-        &self,
-        cancel: CancellationToken,
-    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
            &self.tenant_id,
            &self.timeline_id,
            self.generation,
-            cancel,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -1637,11 +1655,7 @@ mod tests {
        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
-        let initial_index_part = match client
-            .download_index_file(CancellationToken::new())
-            .await
-            .unwrap()
-        {
+        let initial_index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1733,11 +1747,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match client
-            .download_index_file(CancellationToken::new())
-            .await
-            .unwrap()
-        {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1928,7 +1938,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file(CancellationToken::new())
+            .download_index_file()
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::Generation;
-use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
+use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
+use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

@@ -170,52 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
-    cancel: CancellationToken,
-) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
+) -> anyhow::Result<HashSet<TimelineId>> {
    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

-    let listing = download_retry_forever(
-        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
-        &format!("list timelines for {tenant_id}"),
-        cancel,
+    let timelines = download_retry(
+        || storage.list_prefixes(Some(&remote_path)),
+        &format!("list prefixes for {tenant_id}"),
    )
    .await?;

+    if timelines.is_empty() {
+        anyhow::bail!("no timelines found on the remote storage")
+    }
+
    let mut timeline_ids = HashSet::new();
-    let mut other_prefixes = HashSet::new();

-    tracing::info!("list_remote_timelines prefixes:");
-    for p in &listing.prefixes {
-        tracing::info!("  '{p}'");
-    }
-    tracing::info!("list_remote_timelines keys:");
-    for p in &listing.keys {
-        tracing::info!("  '{p}'");
-    }
+    for timeline_remote_storage_key in timelines {
+        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
+            // A `deleted` key within `timelines/` is a marker file, not a timeline.  Ignore it.
+            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
+            continue;
+        }

-    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        match object_name.parse::<TimelineId>() {
-            Ok(t) => timeline_ids.insert(t),
-            Err(_) => other_prefixes.insert(object_name.to_string()),
-        };
+        let timeline_id: TimelineId = object_name
+            .parse()
+            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
+
+        // list_prefixes is assumed to return unique names. Ensure this here.
+        // NB: it's safer to bail out than warn-log this because the pageserver
+        //     needs to absolutely know about _all_ timelines that exist, so that
+        //     GC knows all the branchpoints. If we skipped over a timeline instead,
+        //     GC could delete a layer that's still needed by that timeline.
+        anyhow::ensure!(
+            !timeline_ids.contains(&timeline_id),
+            "list_prefixes contains duplicate timeline id {timeline_id}"
+        );
+        timeline_ids.insert(timeline_id);
    }

-    for key in listing.keys {
-        let object_name = key
-            .object_name()
-            .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
-        other_prefixes.insert(object_name.to_string());
-    }
-
-    Ok((timeline_ids, other_prefixes))
+    Ok(timeline_ids)
 }

 async fn do_download_index_part(
@@ -223,11 +224,10 @@ async fn do_download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    index_generation: Generation,
-    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);

-    let index_part_bytes = download_retry_forever(
+    let index_part_bytes = download_retry(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;

@@ -242,7 +242,6 @@ async fn do_download_index_part(
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
-        cancel,
    )
    .await?;

@@ -264,28 +263,19 @@ pub(super) async fn download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    my_generation: Generation,
-    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
-            .await;
+        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_id,
-        timeline_id,
-        my_generation,
-        cancel.clone(),
-    )
-    .await;
+    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -305,14 +295,8 @@ pub(super) async fn download_index_part(
    //    we want to find the most recent index from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_id,
-        timeline_id,
-        my_generation.previous(),
-        cancel.clone(),
-    )
-    .await;
+    let res =
+        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
    match res {
        Ok(index_part) => {
            tracing::debug!("Found index_part from previous generation");
@@ -356,14 +340,13 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
+            do_download_index_part(storage, tenant_id, timeline_id, g).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
-                .await
+            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
        }
    }
 }
@@ -393,23 +376,3 @@ where
    )
    .await
 }
-
-async fn download_retry_forever<T, O, F>(
-    op: O,
-    description: &str,
-    cancel: CancellationToken,
-) -> Result<T, DownloadError>
-where
-    O: FnMut() -> F,
-    F: Future<Output = Result<T, DownloadError>>,
-{
-    backoff::retry(
-        op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        u32::MAX,
-        description,
-        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
-    )
-    .await
-}
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -294,7 +294,6 @@ async fn cleanup_remaining_timeline_fs_traces(
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
-        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
 }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -27,13 +27,14 @@ use std::collections::VecDeque;
 use std::io;
 use std::io::prelude::*;
 use std::ops::{Deref, DerefMut};
-use std::os::unix::io::{AsRawFd, RawFd};
+use std::os::unix::io::AsRawFd;
 use std::os::unix::prelude::CommandExt;
 use std::process::Stdio;
-use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
+use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -47,7 +48,6 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
-use crate::task_mgr::BACKGROUND_RUNTIME;
 use crate::walrecord::NeonWalRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -72,8 +72,6 @@ pub(crate) struct BufferTag {

 struct ProcessInput {
    stdin: ChildStdin,
-    stderr_fd: RawFd,
-    stdout_fd: RawFd,
    n_requests: usize,
 }

@@ -121,6 +119,7 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
+    /// CANCEL SAFETY: NOT CANCEL SAFE.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -153,6 +152,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
+                    .await
                };
                img = Some(result?);

@@ -173,6 +173,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
+            .await
        }
    }
 }
@@ -194,7 +195,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -283,19 +284,20 @@ impl PostgresRedoManager {
                );
                // Avoid concurrent callers hitting the same issue.
                // We can't prevent it from happening because we want to enable parallelism.
-                let mut guard = self.redo_process.write().unwrap();
-                match &*guard {
-                    Some(current_field_value) => {
-                        if Arc::ptr_eq(current_field_value, &proc) {
-                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                            *guard = None;
+                {
+                    let mut guard = self.redo_process.write().unwrap();
+                    match &*guard {
+                        Some(current_field_value) => {
+                            if Arc::ptr_eq(current_field_value, &proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                *guard = None;
+                            }
+                        }
+                        None => {
+                            // Another thread was faster to observe the error, and already took the process out of rotation.
                        }
                    }
-                    None => {
-                        // Another thread was faster to observe the error, and already took the process out of rotation.
-                    }
                }
-                drop(guard);
                // NB: there may still be other concurrent threads using `proc`.
                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
                // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
@@ -308,7 +310,12 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
+                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
+                wait_done
+                    .wait_for(|v| *v)
+                    .await
+                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -619,7 +626,8 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
-    stderr: Mutex<ChildStderr>,
+    stderr_logger_cancel: CancellationToken,
+    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -668,7 +676,6 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
-
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -682,16 +689,73 @@ impl WalRedoProcess {
        set_nonblock_or_log_err!(stdout)?;
        set_nonblock_or_log_err!(stderr)?;

+        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::new")?;
+
        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

+        let stderr_logger_cancel = CancellationToken::new();
+        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
+            tokio::sync::watch::channel(false);
+        tokio::spawn({
+            let stderr_logger_cancel = stderr_logger_cancel.clone();
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    let _ = stderr_logger_task_done_tx.send(true);
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                loop {
+                    // NB: we purposefully don't do a select! for the cancellation here.
+                    // The cancellation would likely cause us to miss stderr messages.
+                    // We can rely on this to return from .await because when we SIGKILL
+                    // the child, the writing end of the stderr pipe gets closed.
+                    match stderr.readable_mut().await {
+                        Ok(mut guard) => {
+                            let mut errbuf = [0; 16384];
+                            let res = guard.try_io(|fd| {
+                                use std::io::Read;
+                                fd.get_mut().read(&mut errbuf)
+                            });
+                            match res {
+                                Ok(Ok(0)) => {
+                                    // EOF: the child closed its end of the stderr pipe
+                                    break;
+                                }
+                                Ok(Ok(n)) => {
+                                    // The message might not be split correctly into lines here. But this is
+                                    // good enough, the important thing is to get the message to the log.
+                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
+                                    error!(output, "received output");
+                                }
+                                Ok(Err(e)) => {
+                                    error!(error = ?e, "read() error, waiting for cancellation");
+                                    stderr_logger_cancel.cancelled().await;
+                                    error!(error = ?e, "read() error, cancellation complete");
+                                    break;
+                                }
+                                Err(e) => {
+                                    let _e: tokio::io::unix::TryIoError = e;
+                                    // the read() returned WouldBlock, that's expected; loop and wait for readiness again
+                                }
+                            }
+                        }
+                        Err(e) => {
+                            error!(error = ?e, "readable_mut() error, waiting for cancellation");
+                            stderr_logger_cancel.cancelled().await;
+                            error!(error = ?e, "readable_mut() error, cancellation complete");
+                            break;
+                        }
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
+        });
+
        Ok(Self {
            conf,
            tenant_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
-                stdout_fd: stdout.as_raw_fd(),
-                stderr_fd: stderr.as_raw_fd(),
                stdin,
                n_requests: 0,
            }),
@@ -700,7 +764,8 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
-            stderr: Mutex::new(stderr),
+            stderr_logger_cancel,
+            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -774,19 +839,11 @@ impl WalRedoProcess {
        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
        let mut nwrite = 0usize;

-        // Prepare for calling poll()
-        let mut pollfds = [
-            PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT),
-            PollFd::new(proc.stderr_fd, PollFlags::POLLIN),
-            PollFd::new(proc.stdout_fd, PollFlags::POLLIN),
-        ];
+        let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];

-        // We do two things simultaneously: send the old base image and WAL records to
-        // the child process's stdin and forward any logging
-        // information that the child writes to its stderr to the page server's log.
        while nwrite < writebuf.len() {
            let n = loop {
-                match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
@@ -796,31 +853,8 @@ impl WalRedoProcess {
                anyhow::bail!("WAL redo timed out");
            }

-            // If we have some messages in stderr, forward them to the log.
-            let err_revents = pollfds[1].revents().unwrap();
-            if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                let mut errbuf: [u8; 16384] = [0; 16384];
-                let mut stderr = self.stderr.lock().unwrap();
-                let len = stderr.read(&mut errbuf)?;
-
-                // The message might not be split correctly into lines here. But this is
-                // good enough, the important thing is to get the message to the log.
-                if len > 0 {
-                    error!(
-                        "wal-redo-postgres: {}",
-                        String::from_utf8_lossy(&errbuf[0..len])
-                    );
-
-                    // To make sure we capture all log from the process if it fails, keep
-                    // reading from the stderr, before checking the stdout.
-                    continue;
-                }
-            } else if err_revents.contains(PollFlags::POLLHUP) {
-                anyhow::bail!("WAL redo process closed its stderr unexpectedly");
-            }
-
            // If 'stdin' is writeable, do write.
-            let in_revents = pollfds[0].revents().unwrap();
+            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
            } else if in_revents.contains(PollFlags::POLLHUP) {
@@ -845,6 +879,7 @@ impl WalRedoProcess {
        // advancing processed responses number.

        let mut output = self.stdout.lock().unwrap();
+        let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
@@ -855,7 +890,10 @@ impl WalRedoProcess {
                // We do two things simultaneously: reading response from stdout
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
-                    match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
@@ -865,31 +903,8 @@ impl WalRedoProcess {
                    anyhow::bail!("WAL redo timed out");
                }

-                // If we have some messages in stderr, forward them to the log.
-                let err_revents = pollfds[1].revents().unwrap();
-                if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    let mut errbuf: [u8; 16384] = [0; 16384];
-                    let mut stderr = self.stderr.lock().unwrap();
-                    let len = stderr.read(&mut errbuf)?;
-
-                    // The message might not be split correctly into lines here. But this is
-                    // good enough, the important thing is to get the message to the log.
-                    if len > 0 {
-                        error!(
-                            "wal-redo-postgres: {}",
-                            String::from_utf8_lossy(&errbuf[0..len])
-                        );
-
-                        // To make sure we capture all log from the process if it fails, keep
-                        // reading from the stderr, before checking the stdout.
-                        continue;
-                    }
-                } else if err_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stderr unexpectedly");
-                }
-
                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = pollfds[2].revents().unwrap();
+                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
                } else if out_revents.contains(PollFlags::POLLHUP) {
@@ -985,6 +1000,8 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait();
+        self.stderr_logger_cancel.cancel();
+        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }

@@ -1066,7 +1083,7 @@ impl Drop for NoLeakChild {
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
-        BACKGROUND_RUNTIME.spawn(async move {
+        tokio::runtime::Handle::current().spawn(async move {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive of our dropper.
@@ -1199,6 +1216,22 @@ mod tests {
        assert_eq!(page, crate::ZERO_PAGE);
    }

+    #[tokio::test]
+    async fn test_stderr() {
+        // Exercises the stderr path: this redo request is expected to fail (see `unwrap_err`).
+        let h = RedoHarness::new().unwrap();
+        h.manager
+            .request_redo(
+                Key::from_i128(0),
+                Lsn::INVALID,
+                None,
+                short_records(),
+                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+            )
+            .await
+            .unwrap_err();
+    }
+
    #[allow(clippy::octal_escapes)]
    fn short_records() -> Vec<(Lsn, NeonWalRecord)> {
        vec![
@@ -1227,6 +1260,8 @@ mod tests {

    impl RedoHarness {
        fn new() -> anyhow::Result<Self> {
+            crate::tenant::harness::setup_logging();
+
            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -23,6 +23,23 @@ EXTENSION = neon
 DATA = neon--1.0.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"

+EXTRA_CLEAN = \
+	libwalproposer.a
+
+WALPROP_OBJS = \
+	$(WIN32RES) \
+	walproposer.o \
+	neon_utils.o \
+	walproposer_compat.o
+
+.PHONY: walproposer-lib  # alias target: builds libwalproposer.a with -DWALPROPOSER_LIB added to CPPFLAGS
+walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
+walproposer-lib: libwalproposer.a;
+
+.PHONY: libwalproposer.a  # NOTE(review): phony file target, so the archive is re-created on every run
+libwalproposer.a: $(WALPROP_OBJS)
+	rm -f $@
+	$(AR) $(AROPT) $@ $^

 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -79,7 +79,7 @@ static int	CompareLsn(const void *a, const void *b);
 static char *FormatSafekeeperState(SafekeeperState state);
 static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
 static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
-static char *FormatEvents(uint32 events);
+static char *FormatEvents(WalProposer *wp, uint32 events);

 WalProposer *
 WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -98,7 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		port = strchr(host, ':');
 		if (port == NULL)
 		{
-			elog(FATAL, "port is not specified");
+			walprop_log(FATAL, "port is not specified");
 		}
 		*port++ = '\0';
 		sep = strchr(port, ',');
@@ -106,12 +106,11 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 			*sep++ = '\0';
 		if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
 		{
-			elog(FATAL, "Too many safekeepers");
+			walprop_log(FATAL, "Too many safekeepers");
 		}
 		wp->safekeeper[wp->n_safekeepers].host = host;
 		wp->safekeeper[wp->n_safekeepers].port = port;
 		wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
-		wp->safekeeper[wp->n_safekeepers].conn = NULL;
 		wp->safekeeper[wp->n_safekeepers].wp = wp;

 		{
@@ -122,13 +121,11 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
 							   sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
 			if (written > MAXCONNINFO || written < 0)
-				elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
+				walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}

 		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
-		wp->safekeeper[wp->n_safekeepers].xlogreader = wp->api.wal_reader_allocate();
-		if (wp->safekeeper[wp->n_safekeepers].xlogreader == NULL)
-			elog(FATAL, "Failed to allocate xlog reader");
+		wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
 		wp->safekeeper[wp->n_safekeepers].flushWrite = false;
 		wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
 		wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
@@ -136,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	}
 	if (wp->n_safekeepers < 1)
 	{
-		elog(FATAL, "Safekeepers addresses are not specified");
+		walprop_log(FATAL, "Safekeepers addresses are not specified");
 	}
 	wp->quorum = wp->n_safekeepers / 2 + 1;

@@ -144,27 +141,47 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->greetRequest.tag = 'g';
 	wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
 	wp->greetRequest.pgVersion = PG_VERSION_NUM;
-	wp->api.strong_random(&wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
+	wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
 	wp->greetRequest.systemId = wp->config->systemId;
 	if (!wp->config->neon_timeline)
-		elog(FATAL, "neon.timeline_id is not provided");
+		walprop_log(FATAL, "neon.timeline_id is not provided");
 	if (*wp->config->neon_timeline != '\0' &&
 		!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
-		elog(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
+		walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
 	if (!wp->config->neon_tenant)
-		elog(FATAL, "neon.tenant_id is not provided");
+		walprop_log(FATAL, "neon.tenant_id is not provided");
 	if (*wp->config->neon_tenant != '\0' &&
 		!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
-		elog(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
+		walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);

-	wp->greetRequest.timeline = wp->api.get_timeline_id();
+	wp->greetRequest.timeline = wp->config->pgTimeline;
 	wp->greetRequest.walSegSize = wp->config->wal_segment_size;

-	wp->api.init_event_set(wp->n_safekeepers);
+	wp->api.init_event_set(wp);

 	return wp;
 }

+/* Frees each safekeeper's outbuf and vote-response term history, wp's own term history, and finally wp itself. */
+void
+WalProposerFree(WalProposer *wp)
+{
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		Safekeeper *sk = &wp->safekeeper[i];
+
+		Assert(sk->outbuf.data != NULL);
+		pfree(sk->outbuf.data);
+		if (sk->voteResponse.termHistory.entries)
+			pfree(sk->voteResponse.termHistory.entries);
+		sk->voteResponse.termHistory.entries = NULL;
+	}
+	if (wp->propTermHistory.entries != NULL)
+		pfree(wp->propTermHistory.entries);
+	wp->propTermHistory.entries = NULL;
+	pfree(wp);
+}
+
 /*
 * Create new AppendRequest message and start sending it. This function is
 * called from walsender every time the new WAL is available.
@@ -190,10 +207,10 @@ WalProposerPoll(WalProposer *wp)
 		Safekeeper *sk = NULL;
 		int			rc = 0;
 		uint32		events = 0;
-		TimestampTz now = wp->api.get_current_timestamp();
+		TimestampTz now = wp->api.get_current_timestamp(wp);
 		long		timeout = TimeToReconnect(wp, now);

-		rc = wp->api.wait_event_set(timeout, &sk, &events);
+		rc = wp->api.wait_event_set(wp, timeout, &sk, &events);

 		/* Exit loop if latch is set (we got new WAL) */
 		if ((rc == 1 && events & WL_LATCH_SET))
@@ -224,14 +241,14 @@ WalProposerPoll(WalProposer *wp)
 			 */
 			if (!wp->config->syncSafekeepers)
 			{
-				XLogRecPtr	flushed = wp->api.get_flush_rec_ptr();
+				XLogRecPtr	flushed = wp->api.get_flush_rec_ptr(wp);

 				if (flushed > wp->availableLsn)
 					break;
 			}
 		}

-		now = wp->api.get_current_timestamp();
+		now = wp->api.get_current_timestamp(wp);
 		/* timeout expired: poll state */
 		if (rc == 0 || TimeToReconnect(wp, now) <= 0)
 		{
@@ -249,7 +266,7 @@ WalProposerPoll(WalProposer *wp)
 			/*
 			 * Abandon connection attempts which take too long.
 			 */
-			now = wp->api.get_current_timestamp();
+			now = wp->api.get_current_timestamp(wp);
 			for (int i = 0; i < wp->n_safekeepers; i++)
 			{
 				Safekeeper *sk = &wp->safekeeper[i];
@@ -257,7 +274,7 @@ WalProposerPoll(WalProposer *wp)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wp->config->safekeeper_connection_timeout))
 				{
-					elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
+					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
 						 sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
@@ -296,10 +313,10 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
 {
 	WalProposer *wp = to_remove->wp;

-	/* Remove the existing event set */
-	wp->api.free_event_set();
+	/* Remove the existing event set, assign sk->eventPos = -1 */
+	wp->api.free_event_set(wp);
 	/* Re-initialize it without adding any safekeeper events */
-	wp->api.init_event_set(wp->n_safekeepers);
+	wp->api.init_event_set(wp);

 	/*
 	 * loop through the existing safekeepers. If they aren't the one we're
@@ -311,13 +328,11 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
 		uint32		desired_events = WL_NO_EVENTS;
 		Safekeeper *sk = &wp->safekeeper[i];

-		sk->eventPos = -1;
-
 		if (sk == to_remove)
 			continue;

 		/* If this safekeeper isn't offline, add an event for it! */
-		if (sk->conn != NULL)
+		if (sk->state != SS_OFFLINE)
 		{
 			desired_events = SafekeeperStateDesiredEvents(sk->state);
 			/* will set sk->eventPos */
@@ -330,9 +345,7 @@ HackyRemoveWalProposerEvent(Safekeeper *to_remove)
 static void
 ShutdownConnection(Safekeeper *sk)
 {
-	if (sk->conn)
-		sk->wp->api.conn_finish(sk->conn);
-	sk->conn = NULL;
+	sk->wp->api.conn_finish(sk);
 	sk->state = SS_OFFLINE;
 	sk->flushWrite = false;
 	sk->streamingAt = InvalidXLogRecPtr;
@@ -361,23 +374,16 @@ ResetConnection(Safekeeper *sk)
 	}

 	/*
-	 * Try to establish new connection
+	 * Try to establish new connection, it will update sk->conn.
 	 */
-	sk->conn = wp->api.conn_connect_start((char *) &sk->conninfo);
-
-	/*
-	 * "If the result is null, then libpq has been unable to allocate a new
-	 * PGconn structure"
-	 */
-	if (!sk->conn)
-		elog(FATAL, "failed to allocate new PGconn object");
+	wp->api.conn_connect_start(sk);

 	/*
 	 * PQconnectStart won't actually start connecting until we run
 	 * PQconnectPoll. Before we do that though, we need to check that it
 	 * didn't immediately fail.
 	 */
-	if (wp->api.conn_status(sk->conn) == WP_CONNECTION_BAD)
+	if (wp->api.conn_status(sk) == WP_CONNECTION_BAD)
 	{
 		/*---
 		 * According to libpq docs:
@@ -388,15 +394,14 @@ ResetConnection(Safekeeper *sk)
 		 *
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
-		elog(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk->conn));
+		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
+			 sk->host, sk->port, wp->api.conn_error_message(sk));

 		/*
 		 * Even though the connection failed, we still need to clean up the
 		 * object
 		 */
-		wp->api.conn_finish(sk->conn);
-		sk->conn = NULL;
+		wp->api.conn_finish(sk);
 		return;
 	}

@@ -413,10 +418,10 @@ ResetConnection(Safekeeper *sk)
 	 * (see libpqrcv_connect, defined in
 	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
 	 */
-	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
+	walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);

 	sk->state = SS_CONNECTING_WRITE;
-	sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
+	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 	wp->api.add_safekeeper_event_set(sk, WL_SOCKET_WRITEABLE);
 	return;
@@ -447,7 +452,7 @@ TimeToReconnect(WalProposer *wp, TimestampTz now)
 static void
 ReconnectSafekeepers(WalProposer *wp)
 {
-	TimestampTz now = wp->api.get_current_timestamp();
+	TimestampTz now = wp->api.get_current_timestamp(wp);

 	if (TimeToReconnect(wp, now) == 0)
 	{
@@ -467,6 +472,8 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
+	WalProposer *wp = sk->wp;
+
 	/*
 	 * Sanity check. We assume further down that the operations don't block
 	 * because the socket is ready.
@@ -481,7 +488,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * ResetConnection
 			 */
 		case SS_OFFLINE:
-			elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
 				 sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */
@@ -517,7 +524,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * requests.
 			 */
 		case SS_VOTING:
-			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
 				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;
@@ -546,7 +553,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 * Idle state for waiting votes from quorum.
 			 */
 		case SS_IDLE:
-			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
+			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
 				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;
@@ -564,7 +571,7 @@ static void
 HandleConnectionEvent(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
-	WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk->conn);
+	WalProposerConnectPollStatusType result = wp->api.conn_connect_poll(sk);

 	/* The new set of events we'll wait on, after updating */
 	uint32		new_events = WL_NO_EVENTS;
@@ -572,9 +579,9 @@ HandleConnectionEvent(Safekeeper *sk)
 	switch (result)
 	{
 		case WP_CONN_POLLING_OK:
-			elog(LOG, "connected with node %s:%s", sk->host,
+			walprop_log(LOG, "connected with node %s:%s", sk->host,
 				 sk->port);
-			sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
+			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 			/*
 			 * We have to pick some event to update event set. We'll
@@ -596,8 +603,8 @@ HandleConnectionEvent(Safekeeper *sk)
 			break;

 		case WP_CONN_POLLING_FAILED:
-			elog(WARNING, "failed to connect to node '%s:%s': %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk->conn));
+			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
+				 sk->host, sk->port, wp->api.conn_error_message(sk));

 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -631,10 +638,10 @@ SendStartWALPush(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;

-	if (!wp->api.conn_send_query(sk->conn, "START_WAL_PUSH"))
+	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
-		elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-			 sk->host, sk->port, wp->api.conn_error_message(sk->conn));
+		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
+			 sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -647,7 +654,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;

-	switch (wp->api.conn_get_query_result(sk->conn))
+	switch (wp->api.conn_get_query_result(sk))
 	{
 			/*
 			 * Successful result, move on to starting the handshake
@@ -670,8 +677,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 			break;

 		case WP_EXEC_FAILED:
-			elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-				 sk->host, sk->port, wp->api.conn_error_message(sk->conn));
+			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+				 sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;

@@ -681,7 +688,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 * wrong"
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
-			elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
+			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
 				 sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
@@ -717,7 +724,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
 		return;

-	elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+	walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);

 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
@@ -737,7 +744,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		if (wp->n_connected == wp->quorum)
 		{
 			wp->propTerm++;
-			elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
+			walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

 			wp->voteRequest = (VoteRequest)
 			{
@@ -750,7 +757,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	else if (sk->greetResponse.term > wp->propTerm)
 	{
 		/* Another compute with higher term is running. */
-		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
 			 sk->host, sk->port,
 			 sk->greetResponse.term, wp->propTerm);
 	}
@@ -792,7 +799,7 @@ SendVoteRequest(Safekeeper *sk)
 	WalProposer *wp = sk->wp;

 	/* We have quorum for voting, send our vote request */
-	elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
+	walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
 		return;
@@ -809,7 +816,7 @@ RecvVoteResponse(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
 		return;

-	elog(LOG,
+	walprop_log(LOG,
 		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
 		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
 		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -824,7 +831,7 @@ RecvVoteResponse(Safekeeper *sk)
 	if ((!sk->voteResponse.voteGiven) &&
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
-		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
 			 sk->host, sk->port,
 			 sk->voteResponse.term, wp->propTerm);
 	}
@@ -861,49 +868,27 @@ RecvVoteResponse(Safekeeper *sk)
 static void
 HandleElectedProposer(WalProposer *wp)
 {
-	FILE* f;
-	XLogRecPtr lrRestartLsn;
-
 	DetermineEpochStartLsn(wp);

-	/*
-	 * If there are active logical replication subscription we need
-	 * to provide enough WAL for their WAL senders based on th position
-	 * of their replication slots.
-	 */
-	f = fopen("restart.lsn", "rb");
-	if (f != NULL && !wp->config->syncSafekeepers)
-	{
-		fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
-		fclose(f);
-		if (lrRestartLsn != InvalidXLogRecPtr)
-		{
-			elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
-			/* start from the beginning of the segment to fetch page headers verifed by XLogReader */
-			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
-			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
-		}
-	}
-
 	/*
 	 * Check if not all safekeepers are up-to-date, we need to download WAL
 	 * needed to synchronize them
 	 */
 	if (wp->truncateLsn < wp->propEpochStartLsn)
 	{
-		elog(LOG,
+		walprop_log(LOG,
 			 "start recovery because truncateLsn=%X/%X is not "
 			 "equal to epochStartLsn=%X/%X",
 			 LSN_FORMAT_ARGS(wp->truncateLsn),
 			 LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 		/* Perform recovery */
 		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
-			elog(FATAL, "Failed to recover state");
+			walprop_log(FATAL, "Failed to recover state");
 	}
 	else if (wp->config->syncSafekeepers)
 	{
 		/* Sync is not needed: just exit */
-		wp->api.finish_sync_safekeepers(wp->propEpochStartLsn);
+		wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
 		/* unreachable */
 	}

@@ -1004,7 +989,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 				if (wp->timelineStartLsn != InvalidXLogRecPtr &&
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
-					elog(WARNING,
+					walprop_log(WARNING,
 						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
 						 LSN_FORMAT_ARGS(wp->timelineStartLsn),
 						 LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
@@ -1020,12 +1005,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 	 */
 	if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
 	{
-		wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn();
+		wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
 		if (wp->timelineStartLsn == InvalidXLogRecPtr)
 		{
-			wp->timelineStartLsn = wp->api.get_redo_start_lsn();
+			wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
 		}
-		elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+		walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 	}

 	/*
@@ -1052,7 +1037,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

-	elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		 wp->quorum,
 		 wp->propTerm,
 		 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
@@ -1066,7 +1051,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 	 */
 	if (!wp->config->syncSafekeepers)
 	{
-		WalproposerShmemState *walprop_shared = wp->api.get_shmem_state();
+		WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp);

 		/*
 		 * Basebackup LSN always points to the beginning of the record (not
@@ -1074,7 +1059,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 		 * Safekeepers don't skip header as they need continious stream of
 		 * data, so correct LSN for comparison.
 		 */
-		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn())
+		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
 			 * However, allow to proceed if previously elected leader was me;
@@ -1084,14 +1069,21 @@ DetermineEpochStartLsn(WalProposer *wp)
 			if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 											walprop_shared->mineLastElectedTerm)))
 			{
-				elog(PANIC,
+				walprop_log(PANIC,
 					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
 					 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn()));
+					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}
+
+	/*
+	 * WalProposer has just elected itself and initialized its term history,
+	 * so we can now invoke the after_election callback. Usually it updates
+	 * truncateLsn to fetch WAL for logical replication.
+	 */
+	wp->api.after_election(wp);
 }

 /*
@@ -1162,7 +1154,7 @@ SendProposerElected(Safekeeper *sk)
 			 */
 			sk->startStreamingAt = wp->truncateLsn;

-			elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
+			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
 				 sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
 				 LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
@@ -1197,7 +1189,7 @@ SendProposerElected(Safekeeper *sk)
 	msg.timelineStartLsn = wp->timelineStartLsn;

 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
-	elog(LOG,
+	walprop_log(LOG,
 		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
 		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

@@ -1362,13 +1354,12 @@ SendAppendRequests(Safekeeper *sk)
 		req = &sk->appendRequest;
 		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

-		ereport(DEBUG2,
-				(errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
 						req->endLsn - req->beginLsn,
 						LSN_FORMAT_ARGS(req->beginLsn),
 						LSN_FORMAT_ARGS(req->endLsn),
 						LSN_FORMAT_ARGS(req->commitLsn),
-						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port)));
+						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

 		resetStringInfo(&sk->outbuf);

@@ -1378,13 +1369,13 @@ SendAppendRequests(Safekeeper *sk)
 		/* write the WAL itself */
 		enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
 		/* wal_read will raise error on failure */
-		wp->api.wal_read(sk->xlogreader,
+		wp->api.wal_read(sk,
 						 &sk->outbuf.data[sk->outbuf.len],
 						 req->beginLsn,
 						 req->endLsn - req->beginLsn);
 		sk->outbuf.len += req->endLsn - req->beginLsn;

-		writeResult = wp->api.conn_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len);
+		writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);

 		/* Mark current message as sent, whatever the result is */
 		sk->streamingAt = endLsn;
@@ -1406,9 +1397,9 @@ SendAppendRequests(Safekeeper *sk)
 				return true;

 			case PG_ASYNC_WRITE_FAIL:
-				elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
+				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
 					 sk->host, sk->port, FormatSafekeeperState(sk->state),
-					 wp->api.conn_error_message(sk->conn));
+					 wp->api.conn_error_message(sk));
 				ShutdownConnection(sk);
 				return false;
 			default:
@@ -1446,17 +1437,16 @@ RecvAppendResponses(Safekeeper *sk)
 		if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
 			break;

-		ereport(DEBUG2,
-				(errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
+		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
 						sk->appendResponse.term,
 						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
 						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-						sk->host, sk->port)));
+						sk->host, sk->port);

 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/* Another compute with higher term is running. */
-			elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
+			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
 				 sk->host, sk->port,
 				 sk->appendResponse.term, wp->propTerm);
 		}
@@ -1484,7 +1474,7 @@ RecvAppendResponses(Safekeeper *sk)

 /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
 void
-ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
+ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf)
 {
 	uint8		nkeys;
 	int			i;
@@ -1502,7 +1492,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
 				 rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
@@ -1510,7 +1500,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
 				 LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
@@ -1518,7 +1508,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
 				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
@@ -1526,7 +1516,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
 				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
@@ -1539,7 +1529,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)

 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-				elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
 					 rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
@@ -1554,7 +1544,7 @@ ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback *rf)
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
+			walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 			pq_getmsgbytes(reply_message, len);
 		};
 	}
@@ -1637,7 +1627,7 @@ HandleSafekeeperResponse(WalProposer *wp)
 		 * Advance the replication slot to free up old WAL files. Note that
 		 * slot doesn't exist if we are in syncSafekeepers mode.
 		 */
-		wp->api.confirm_wal_streamed(wp->truncateLsn);
+		wp->api.confirm_wal_streamed(wp, wp->truncateLsn);
 	}

 	/*
@@ -1684,7 +1674,7 @@ HandleSafekeeperResponse(WalProposer *wp)
 			 */
 			BroadcastAppendRequest(wp);

-			wp->api.finish_sync_safekeepers(wp->propEpochStartLsn);
+			wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
 			/* unreachable */
 		}
 	}
@@ -1699,7 +1689,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 {
 	WalProposer *wp = sk->wp;

-	switch (wp->api.conn_async_read(sk->conn, buf, buf_size))
+	switch (wp->api.conn_async_read(sk, buf, buf_size))
 	{
 		case PG_ASYNC_READ_SUCCESS:
 			return true;
@@ -1709,9 +1699,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 			return false;

 		case PG_ASYNC_READ_FAIL:
-			elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
+			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
 				 sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk->conn));
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1749,12 +1739,12 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	tag = pq_getmsgint64_le(&s);
 	if (tag != anymsg->tag)
 	{
-		elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
 			 sk->port, FormatSafekeeperState(sk->state));
 		ResetConnection(sk);
 		return false;
 	}
-	sk->latestMsgReceivedAt = wp->api.get_current_timestamp();
+	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
 	switch (tag)
 	{
 		case 'g':
@@ -1798,7 +1788,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 				msg->hs.xmin.value = pq_getmsgint64_le(&s);
 				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
 				if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
-					ParsePageserverFeedbackMessage(&s, &msg->rf);
+					ParsePageserverFeedbackMessage(wp, &s, &msg->rf);
 				pq_getmsgend(&s);
 				return true;
 			}
@@ -1823,11 +1813,11 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	WalProposer *wp = sk->wp;
 	uint32		events;

-	if (!wp->api.conn_blocking_write(sk->conn, msg, msg_size))
+	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
-		elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
+		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
 			 sk->host, sk->port, FormatSafekeeperState(sk->state),
-			 wp->api.conn_error_message(sk->conn));
+			 wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1857,7 +1847,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 {
 	WalProposer *wp = sk->wp;

-	switch (wp->api.conn_async_write(sk->conn, msg, msg_size))
+	switch (wp->api.conn_async_write(sk, msg, msg_size))
 	{
 		case PG_ASYNC_WRITE_SUCCESS:
 			return true;
@@ -1872,9 +1862,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
-			elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
+			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
 				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk->conn));
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1902,7 +1892,7 @@ AsyncFlush(Safekeeper *sk)
 	 *   1 if unable to send everything yet [call PQflush again]
 	 *  -1 if it failed                     [emit an error]
 	 */
-	switch (wp->api.conn_flush(sk->conn))
+	switch (wp->api.conn_flush(sk))
 	{
 		case 0:
 			/* flush is done */
@@ -1911,9 +1901,9 @@ AsyncFlush(Safekeeper *sk)
 			/* Nothing to do; try again when the socket's ready */
 			return false;
 		case -1:
-			elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
+			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
 				 sk->host, sk->port, FormatSafekeeperState(sk->state),
-				 wp->api.conn_error_message(sk->conn));
+				 wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -1942,11 +1932,11 @@ CompareLsn(const void *a, const void *b)
 *
 * The strings are intended to be used as a prefix to "state", e.g.:
 *
- *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
+ *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
 *
 * If this sort of phrasing doesn't fit the message, instead use something like:
 *
- *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
+ *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
 */
 static char *
 FormatSafekeeperState(SafekeeperState state)
@@ -1994,6 +1984,7 @@ FormatSafekeeperState(SafekeeperState state)
 static void
 AssertEventsOkForState(uint32 events, Safekeeper *sk)
 {
+	WalProposer *wp = sk->wp;
 	uint32		expected = SafekeeperStateDesiredEvents(sk->state);

 	/*
@@ -2016,8 +2007,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * To give a descriptive message in the case of failure, we use elog
 		 * and then an assertion that's guaranteed to fail.
 		 */
-		elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-			 FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+			 FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
 		Assert(events_ok_for_state);
 	}
 }
@@ -2090,7 +2081,7 @@ SafekeeperStateDesiredEvents(SafekeeperState state)
 * The string should not be freed. It should also not be expected to remain the same between
 * function calls. */
 static char *
-FormatEvents(uint32 events)
+FormatEvents(WalProposer *wp, uint32 events)
 {
 	static char return_str[8];

@@ -2119,7 +2110,7 @@ FormatEvents(uint32 events)

 	if (events & (~all_flags))
 	{
-		elog(WARNING, "Event formatting found unexpected component %d",
+		walprop_log(WARNING, "Event formatting found unexpected component %d",
 			 events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -333,24 +333,11 @@ typedef struct Safekeeper
 	 */
 	char		conninfo[MAXCONNINFO];

-	/*
-	 * postgres protocol connection to the WAL acceptor
-	 *
-	 * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
-	 * reach SS_ACTIVE; not before.
-	 */
-	WalProposerConn *conn;
-
 	/*
 	 * Temporary buffer for the message being sent to the safekeeper.
 	 */
 	StringInfoData outbuf;

-	/*
-	 * WAL reader, allocated for each safekeeper.
-	 */
-	XLogReaderState *xlogreader;
-
 	/*
 	 * Streaming will start here; must be record boundary.
 	 */
@@ -361,13 +348,43 @@ typedef struct Safekeeper
 	XLogRecPtr	streamingAt;	/* current streaming position */
 	AppendRequestHeader appendRequest;	/* request for sending to safekeeper */

-	int			eventPos;		/* position in wait event set. Equal to -1 if*
-								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
 	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
+
+
+	/* postgres-specific fields */
+	#ifndef WALPROPOSER_LIB
+	/*
+	 * postgres protocol connection to the WAL acceptor
+	 *
+	 * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
+	 * reach SS_ACTIVE; not before.
+	 */
+	WalProposerConn *conn;
+
+	/*
+	 * WAL reader, allocated for each safekeeper.
+	 */
+	XLogReaderState *xlogreader;
+
+	/*
+	 * Position in wait event set. Equal to -1 if no event
+	 */
+	int			eventPos;
+	#endif
+
+
+	/* WalProposer library specifics */
+	#ifdef WALPROPOSER_LIB
+	/*
+	 * Buffer for incoming messages. Usually a Rust vector is stored here.
+	 * The caller is responsible for freeing the buffer.
+	 */
+	StringInfoData inbuf;
+	#endif
 } Safekeeper;

 /* Re-exported PostgresPollingStatusType */
@@ -433,7 +450,7 @@ typedef struct walproposer_api
 	 * Get WalproposerShmemState. This is used to store information about last
 	 * elected term.
 	 */
-	WalproposerShmemState *(*get_shmem_state) (void);
+	WalproposerShmemState *(*get_shmem_state) (WalProposer *wp);

 	/*
 	 * Start receiving notifications about new WAL. This is an infinite loop
@@ -443,61 +460,63 @@ typedef struct walproposer_api
 	void		(*start_streaming) (WalProposer *wp, XLogRecPtr startpos);

 	/* Get pointer to the latest available WAL. */
-	XLogRecPtr	(*get_flush_rec_ptr) (void);
+	XLogRecPtr	(*get_flush_rec_ptr) (WalProposer *wp);

 	/* Get current time. */
-	TimestampTz (*get_current_timestamp) (void);
-
-	/* Get postgres timeline. */
-	TimeLineID	(*get_timeline_id) (void);
+	TimestampTz (*get_current_timestamp) (WalProposer *wp);

 	/* Current error message, aka PQerrorMessage. */
-	char	   *(*conn_error_message) (WalProposerConn *conn);
+	char	   *(*conn_error_message) (Safekeeper *sk);

 	/* Connection status, aka PQstatus. */
-	WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
+	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);

 	/* Start the connection, aka PQconnectStart. */
-	WalProposerConn *(*conn_connect_start) (char *conninfo);
+	void (*conn_connect_start) (Safekeeper *sk);

 	/* Poll an asynchronous connection, aka PQconnectPoll. */
-	WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
+	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);

 	/* Send a blocking SQL query, aka PQsendQuery. */
-	bool		(*conn_send_query) (WalProposerConn *conn, char *query);
+	bool		(*conn_send_query) (Safekeeper *sk, char *query);

 	/* Read the query result, aka PQgetResult. */
-	WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
+	WalProposerExecStatusType (*conn_get_query_result) (Safekeeper *sk);

 	/* Flush buffer to the network, aka PQflush. */
-	int			(*conn_flush) (WalProposerConn *conn);
+	int			(*conn_flush) (Safekeeper *sk);

 	/* Close the connection, aka PQfinish. */
-	void		(*conn_finish) (WalProposerConn *conn);
+	void		(*conn_finish) (Safekeeper *sk);

-	/* Try to read CopyData message, aka PQgetCopyData. */
-	PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
+	/*
+	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
+	 *
+	 * On success, the data is placed in *buf. It is valid until the next call
+	 * to this function.
+	 */
+	PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);

 	/* Try to write CopyData message, aka PQputCopyData. */
-	PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
+	PGAsyncWriteResult (*conn_async_write) (Safekeeper *sk, void const *buf, size_t size);

 	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
-	bool		(*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
+	bool		(*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);

 	/* Download WAL from startpos to endpos and make it available locally. */
 	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);

 	/* Read WAL from disk to buf. */
-	void		(*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
+	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

 	/* Allocate WAL reader. */
-	XLogReaderState *(*wal_reader_allocate) (void);
+	void (*wal_reader_allocate) (Safekeeper *sk);

 	/* Deallocate event set. */
-	void		(*free_event_set) (void);
+	void		(*free_event_set) (WalProposer *wp);

 	/* Initialize event set. */
-	void		(*init_event_set) (int n_safekeepers);
+	void		(*init_event_set) (WalProposer *wp);

 	/* Update events for an existing safekeeper connection. */
 	void		(*update_event_set) (Safekeeper *sk, uint32 events);
@@ -513,22 +532,22 @@ typedef struct walproposer_api
 	 * events mask to indicate events and sets sk to the safekeeper which has
 	 * an event.
 	 */
-	int			(*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
+	int			(*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);

 	/* Read random bytes. */
-	bool		(*strong_random) (void *buf, size_t len);
+	bool		(*strong_random) (WalProposer *wp, void *buf, size_t len);

 	/*
 	 * Get a basebackup LSN. Used to cross-validate with the latest available
 	 * LSN on the safekeepers.
 	 */
-	XLogRecPtr	(*get_redo_start_lsn) (void);
+	XLogRecPtr	(*get_redo_start_lsn) (WalProposer *wp);

 	/*
 	 * Finish sync safekeepers with the given LSN. This function should not
 	 * return and should exit the program.
 	 */
-	void		(*finish_sync_safekeepers) (XLogRecPtr lsn);
+	void		(*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);

 	/*
 	 * Called after every new message from the safekeeper. Used to propagate
@@ -541,7 +560,22 @@ typedef struct walproposer_api
 	 * Called on peer_horizon_lsn updates. Used to advance replication slot
 	 * and to free up disk space by deleting unnecessary WAL.
 	 */
-	void		(*confirm_wal_streamed) (XLogRecPtr lsn);
+	void		(*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
+
+	/*
+	 * Write a log message to the internal log processor. This is used only
+	 * when walproposer is compiled as a library. Otherwise, all logging is
+	 * handled by elog().
+	 */
+	void		(*log_internal) (WalProposer *wp, int level, const char *line);
+
+	/*
+	 * Called right after the proposer is elected, but before it starts
+	 * recovery and sends the ProposerElected message to the safekeepers.
+	 *
+	 * Used by logical replication to update truncateLsn.
+	 */
+	void		(*after_election) (WalProposer *wp);
 } walproposer_api;

 /*
@@ -590,6 +624,13 @@ typedef struct WalProposerConfig

 	/* Will be passed to safekeepers in greet request. */
 	uint64		systemId;
+
+	/* Will be passed to safekeepers in greet request. */
+	TimeLineID  pgTimeline;
+
+#ifdef WALPROPOSER_LIB
+	void *callback_data;
+#endif
 } WalProposerConfig;


@@ -666,7 +707,16 @@ extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api
 extern void WalProposerStart(WalProposer *wp);
 extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
 extern void WalProposerPoll(WalProposer *wp);
-extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
-										   PageserverFeedback *rf);
+extern void WalProposerFree(WalProposer *wp);
+
+
+#define WPEVENT		1337	/* special log level for walproposer internal events */
+
+#ifdef WALPROPOSER_LIB
+void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
+#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
+#else
+#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
+#endif

 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -0,0 +1,192 @@
+/*
+ * Contains functions copied or adapted from libpq and from some internal postgres
+ * code. This avoids linking against a full postgres server installation. This file
+ * is compiled as a part of the libwalproposer static library.
+ */
+
+#include <stdio.h>
+#include "walproposer.h"
+#include "utils/datetime.h"
+#include "miscadmin.h"
+
+/* Report a failed condition and its source location on stderr, then exit(1). */
+void
+ExceptionalCondition(const char *conditionName, const char *fileName, int lineNumber)
+{
+	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n", fileName, lineNumber, conditionName);
+	fputs("aborting...\n", stderr);
+	exit(1);
+}
+
+void
+pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
+{
+	if (datalen < 0 || datalen > (msg->len - msg->cursor))	/* negative, or more than the unread remainder */
+		ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
+	memcpy(buf, &msg->data[msg->cursor], datalen);	/* copy datalen unread bytes into buf */
+	msg->cursor += datalen;		/* ... and mark them as consumed */
+}
+
+/* --------------------------------
+ *		pq_getmsgint	- get a binary integer from a message buffer
+ *		b is the width in bytes (1, 2 or 4); 2- and 4-byte values are passed
+ *		through pg_ntoh16/pg_ntoh32.  Values are treated as unsigned.
+ * --------------------------------
+ */
+unsigned int
+pq_getmsgint(StringInfo msg, int b)
+{
+	unsigned int result;
+	unsigned char n8;
+	uint16		n16;
+	uint32		n32;
+
+	switch (b)
+	{
+		case 1:
+			pq_copymsgbytes(msg, (char *) &n8, 1);
+			result = n8;
+			break;
+		case 2:
+			pq_copymsgbytes(msg, (char *) &n16, 2);
+			result = pg_ntoh16(n16);
+			break;
+		case 4:
+			pq_copymsgbytes(msg, (char *) &n32, 4);
+			result = pg_ntoh32(n32);
+			break;
+		default:				/* any other width: report it and bail out */
+			fprintf(stderr, "unsupported integer size %d\n", b);
+			ExceptionalCondition("unsupported integer size", __FILE__, __LINE__);
+			result = 0;			/* keep compiler quiet */
+			break;
+	}
+	return result;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64	- read an 8-byte integer from a message buffer
+ *
+ * The bytes are copied out with pq_copymsgbytes and converted with
+ * pg_ntoh64.  Kept separate from pq_getmsgint so that the narrower reads
+ * do not have to return int64, which may be slow on some machines.
+ * --------------------------------
+ */
+int64
+pq_getmsgint64(StringInfo msg)
+{
+	uint64		wire_value;
+
+	/* fetch the raw eight bytes, then convert them on the way out */
+	pq_copymsgbytes(msg, (char *) &wire_value, sizeof(wire_value));
+	return pg_ntoh64(wire_value);
+}
+
+/* --------------------------------
+ *		pq_getmsgbyte	- get the next raw byte (as unsigned char) and advance
+ * --------------------------------
+ */
+int
+pq_getmsgbyte(StringInfo msg)
+{
+	if (msg->cursor >= msg->len)	/* nothing left to read */
+		ExceptionalCondition("no data left in message", __FILE__, __LINE__);
+	return (unsigned char) msg->data[msg->cursor++];
+}
+
+/* --------------------------------
+ *		pq_getmsgbytes	- get raw data from a message buffer
+ *
+ *		Returns a pointer directly into the message buffer (nothing is copied)
+ *		and advances the cursor by datalen; alignment is not guaranteed.
+ * --------------------------------
+ */
+const char *
+pq_getmsgbytes(StringInfo msg, int datalen)
+{
+	const char *result;
+
+	if (datalen < 0 || datalen > (msg->len - msg->cursor))
+		ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
+	result = &msg->data[msg->cursor];
+	msg->cursor += datalen;
+	return result;
+}
+
+/* --------------------------------
+ *		pq_getmsgstring - get a null-terminated text string
+ *
+ *		Returns a pointer directly into the message buffer; this version does
+ *		no conversion and never allocates.  The cursor skips the terminator.
+ * --------------------------------
+ */
+const char *
+pq_getmsgstring(StringInfo msg)
+{
+	char	   *str;
+	int			slen;
+
+	str = &msg->data[msg->cursor];
+
+	/*
+	 * strlen() relies on the StringInfo data having a trailing null byte
+	 * (NOTE(review): confirm this holds for buffers supplied by the library
+	 * caller).  Then check that the null was found inside the message.
+	 */
+	slen = strlen(str);
+	if (msg->cursor + slen >= msg->len)
+		ExceptionalCondition("invalid string in message", __FILE__, __LINE__);
+	msg->cursor += slen + 1;
+
+	return str;
+}
+
+/* --------------------------------
+ *		pq_getmsgend	- verify message fully consumed (cursor == len)
+ * --------------------------------
+ */
+void
+pq_getmsgend(StringInfo msg)
+{
+	if (msg->cursor != msg->len)	/* unread bytes remain, or cursor overshot */
+		ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
+}
+
+
+/*
+ * Produce a C-string representation of a TimestampTz as "TimestampTz(<raw>)".
+ *
+ * Returns a static buffer that the next call overwrites; copy it to keep it.
+ */
+const char *
+timestamptz_to_str(TimestampTz t)
+{
+	static char buf[MAXDATELEN + 1];
+
+	snprintf(buf, sizeof(buf), "TimestampTz(" INT64_FORMAT ")", (int64) t);
+	return buf;
+}
+
+/* True if stop_time is at least msec * 1000 timestamp units past start_time. */
+bool
+TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
+{
+	TimestampTz threshold = msec * INT64CONST(1000);
+
+	return (stop_time - start_time) >= threshold;
+}
+
+/* Format a printf-style message into a 1024-byte stack buffer (vsnprintf
+ * truncates anything longer) and pass it to the log_internal callback. */
+void
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
+{
+	char		line[1024];
+	va_list		ap;
+
+	fmt = _(fmt);				/* run the format string through _() first */
+	va_start(ap, fmt);
+	vsnprintf(line, sizeof(line), fmt, ap);
+	va_end(ap);
+	wp->api.log_internal(wp, elevel, line);
+}
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -73,7 +73,8 @@ static void walprop_register_bgworker(void);
 static void walprop_pg_init_standalone_sync_safekeepers(void);
 static void walprop_pg_init_walsender(void);
 static void walprop_pg_init_bgworker(void);
-static TimestampTz walprop_pg_get_current_timestamp(void);
+static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
+static TimeLineID walprop_pg_get_timeline_id(void);
 static void walprop_pg_load_libpqwalreceiver(void);

 static process_interrupts_callback_t PrevProcessInterruptsCallback;
@@ -104,6 +105,7 @@ init_walprop_config(bool syncSafekeepers)
 		walprop_config.systemId = GetSystemIdentifier();
 	else
 		walprop_config.systemId = 0;
+	walprop_config.pgTimeline = walprop_pg_get_timeline_id();
 }

 /*
@@ -136,7 +138,7 @@ WalProposerMain(Datum main_arg)
 	walprop_pg_load_libpqwalreceiver();

 	wp = WalProposerCreate(&walprop_config, walprop_pg);
-	wp->last_reconnect_attempt = walprop_pg_get_current_timestamp();
+	wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);

 	walprop_pg_init_walsender();
 	WalProposerStart(wp);
@@ -379,7 +381,7 @@ nwp_shmem_startup_hook(void)
 }

 static WalproposerShmemState *
-walprop_pg_get_shmem_state(void)
+walprop_pg_get_shmem_state(WalProposer *wp)
 {
 	Assert(walprop_shared != NULL);
 	return walprop_shared;
@@ -505,7 +507,7 @@ walprop_pg_init_bgworker(void)
 }

 static XLogRecPtr
-walprop_pg_get_flush_rec_ptr(void)
+walprop_pg_get_flush_rec_ptr(WalProposer *wp)
 {
 #if PG_MAJORVERSION_NUM < 15
 	return GetFlushRecPtr();
@@ -515,7 +517,7 @@ walprop_pg_get_flush_rec_ptr(void)
 }

 static TimestampTz
-walprop_pg_get_current_timestamp(void)
+walprop_pg_get_current_timestamp(WalProposer *wp)
 {
 	return GetCurrentTimestamp();
 }
@@ -565,15 +567,15 @@ ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)

 /* Exported function definitions */
 static char *
-walprop_error_message(WalProposerConn *conn)
+walprop_error_message(Safekeeper *sk)
 {
-	return PQerrorMessage(conn->pg_conn);
+	return PQerrorMessage(sk->conn->pg_conn);
 }

 static WalProposerConnStatusType
-walprop_status(WalProposerConn *conn)
+walprop_status(Safekeeper *sk)
 {
-	switch (PQstatus(conn->pg_conn))
+	switch (PQstatus(sk->conn->pg_conn))
 	{
 		case CONNECTION_OK:
 			return WP_CONNECTION_OK;
@@ -584,16 +586,17 @@ walprop_status(WalProposerConn *conn)
 	}
 }

-static WalProposerConn *
-walprop_connect_start(char *conninfo)
+static void
+walprop_connect_start(Safekeeper *sk)
 {
-	WalProposerConn *conn;
 	PGconn	   *pg_conn;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
 	char	   *password = neon_auth_token;

+	Assert(sk->conn == NULL);
+
 	/*
 	 * Connect using the given connection string. If the NEON_AUTH_TOKEN
 	 * environment variable was set, use that as the password.
@@ -611,7 +614,7 @@ walprop_connect_start(char *conninfo)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = conninfo;
+	values[n] = sk->conninfo;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -619,11 +622,11 @@ walprop_connect_start(char *conninfo)
 	pg_conn = PQconnectStartParams(keywords, values, 1);

 	/*
-	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
-	 * replicate the behavior of PQconnectStart here.
+	 * "If the result is null, then libpq has been unable to allocate a new
+	 * PGconn structure"
 	 */
 	if (!pg_conn)
-		return NULL;
+		elog(FATAL, "failed to allocate new PGconn object");

 	/*
 	 * And in theory this allocation can fail as well, but it's incredibly
@@ -632,20 +635,19 @@ walprop_connect_start(char *conninfo)
 	 * palloc will exit on failure though, so there's not much we could do if
 	 * it *did* fail.
 	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
+	sk->conn = palloc(sizeof(WalProposerConn));
+	sk->conn->pg_conn = pg_conn;
+	sk->conn->is_nonblocking = false;	/* connections always start in blocking
 									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
+	sk->conn->recvbuf = NULL;
 }

 static WalProposerConnectPollStatusType
-walprop_connect_poll(WalProposerConn *conn)
+walprop_connect_poll(Safekeeper *sk)
 {
 	WalProposerConnectPollStatusType return_val;

-	switch (PQconnectPoll(conn->pg_conn))
+	switch (PQconnectPoll(sk->conn->pg_conn))
 	{
 		case PGRES_POLLING_FAILED:
 			return_val = WP_CONN_POLLING_FAILED;
@@ -682,24 +684,24 @@ walprop_connect_poll(WalProposerConn *conn)
 }

 static bool
-walprop_send_query(WalProposerConn *conn, char *query)
+walprop_send_query(Safekeeper *sk, char *query)
 {
 	/*
 	 * We need to be in blocking mode for sending the query to run without
 	 * requiring a call to PQflush
 	 */
-	if (!ensure_nonblocking_status(conn, false))
+	if (!ensure_nonblocking_status(sk->conn, false))
 		return false;

 	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
+	if (!PQsendQuery(sk->conn->pg_conn, query))
 		return false;

 	return true;
 }

 static WalProposerExecStatusType
-walprop_get_query_result(WalProposerConn *conn)
+walprop_get_query_result(Safekeeper *sk)
 {
 	PGresult   *result;
 	WalProposerExecStatusType return_val;
@@ -708,14 +710,14 @@ walprop_get_query_result(WalProposerConn *conn)
 	char	   *unexpected_success = NULL;

 	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
+	if (!PQconsumeInput(sk->conn->pg_conn))
 		return WP_EXEC_FAILED;

-	if (PQisBusy(conn->pg_conn))
+	if (PQisBusy(sk->conn->pg_conn))
 		return WP_EXEC_NEEDS_INPUT;


-	result = PQgetResult(conn->pg_conn);
+	result = PQgetResult(sk->conn->pg_conn);

 	/*
 	 * PQgetResult returns NULL only if getting the result was successful &
@@ -777,24 +779,28 @@ walprop_get_query_result(WalProposerConn *conn)
 }

 static pgsocket
-walprop_socket(WalProposerConn *conn)
+walprop_socket(Safekeeper *sk)
 {
-	return PQsocket(conn->pg_conn);
+	return PQsocket(sk->conn->pg_conn);
 }

 static int
-walprop_flush(WalProposerConn *conn)
+walprop_flush(Safekeeper *sk)
 {
-	return (PQflush(conn->pg_conn));
+	return (PQflush(sk->conn->pg_conn));
 }

 static void
-walprop_finish(WalProposerConn *conn)
+walprop_finish(Safekeeper *sk)
 {
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
+	if (!sk->conn)				/* no connection object: nothing to release */
+		return;
+
+	if (sk->conn->recvbuf != NULL)
+		PQfreemem(sk->conn->recvbuf);
+	PQfinish(sk->conn->pg_conn);
+	pfree(sk->conn);
+	sk->conn = NULL;			/* a repeated call now returns early; walprop_connect_start asserts NULL */
 }

 /*
@@ -804,18 +810,18 @@ walprop_finish(WalProposerConn *conn)
 * to this function.
 */
 static PGAsyncReadResult
-walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
+walprop_async_read(Safekeeper *sk, char **buf, int *amount)
 {
 	int			result;

-	if (conn->recvbuf != NULL)
+	if (sk->conn->recvbuf != NULL)
 	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
+		PQfreemem(sk->conn->recvbuf);
+		sk->conn->recvbuf = NULL;
 	}

 	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
+	if (!PQconsumeInput(sk->conn->pg_conn))
 	{
 		*amount = 0;
 		*buf = NULL;
@@ -833,7 +839,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 	 * sometimes be triggered by the server returning an ErrorResponse (which
 	 * also happens to have the effect that the copy is done).
 	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
+	switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
 	{
 		case 0:
 			*amount = 0;
@@ -848,7 +854,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 				 * We can check PQgetResult to make sure that the server
 				 * failed; it'll always result in PGRES_FATAL_ERROR
 				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
+				ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));

 				if (status != PGRES_FATAL_ERROR)
 					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
@@ -869,18 +875,18 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
 		default:
 			/* Positive values indicate the size of the returned result */
 			*amount = result;
-			*buf = conn->recvbuf;
+			*buf = sk->conn->recvbuf;
 			return PG_ASYNC_READ_SUCCESS;
 	}
 }

 static PGAsyncWriteResult
-walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
+walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 {
 	int			result;

 	/* If we aren't in non-blocking mode, switch to it. */
-	if (!ensure_nonblocking_status(conn, true))
+	if (!ensure_nonblocking_status(sk->conn, true))
 		return PG_ASYNC_WRITE_FAIL;

 	/*
@@ -888,7 +894,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 	 * queued, 0 if it was not queued because of full buffers, or -1 if an
 	 * error occurred
 	 */
-	result = PQputCopyData(conn->pg_conn, buf, size);
+	result = PQputCopyData(sk->conn->pg_conn, buf, size);

 	/*
 	 * We won't get a result of zero because walproposer always empties the
@@ -916,7 +922,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 	 * sucessful, 1 if it was unable to send all the data in the send queue
 	 * yet -1 if it failed for some reason
 	 */
-	switch (result = PQflush(conn->pg_conn))
+	switch (result = PQflush(sk->conn->pg_conn))
 	{
 		case 0:
 			return PG_ASYNC_WRITE_SUCCESS;
@@ -934,22 +940,22 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
 * information, refer to the comments there.
 */
 static bool
-walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
+walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
 {
 	int			result;

 	/* If we are in non-blocking mode, switch out of it. */
-	if (!ensure_nonblocking_status(conn, false))
+	if (!ensure_nonblocking_status(sk->conn, false))
 		return false;

-	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
+	if ((result = PQputCopyData(sk->conn->pg_conn, buf, size)) == -1)
 		return false;

 	Assert(result == 1);

 	/* Because the connection is non-blocking, flushing returns 0 or -1 */

-	if ((result = PQflush(conn->pg_conn)) == -1)
+	if ((result = PQflush(sk->conn->pg_conn)) == -1)
 		return false;

 	Assert(result == 0);
@@ -1381,11 +1387,11 @@ XLogWalPropClose(XLogRecPtr recptr)
 }

 static void
-walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count)
+walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
 {
 	WALReadError errinfo;

-	if (!WALRead(state,
+	if (!WALRead(sk->xlogreader,
 				 buf,
 				 startptr,
 				 count,
@@ -1396,31 +1402,38 @@ walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size
 	}
 }

-static XLogReaderState *
-walprop_pg_wal_reader_allocate(void)
+static void
+walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
-	return XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
+	sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
+	if (sk->xlogreader == NULL)	/* the reader is stored on sk; a failed allocation is fatal */
+		elog(FATAL, "Failed to allocate xlog reader");
 }

 static WaitEventSet *waitEvents;

 static void
-walprop_pg_free_event_set(void)
+walprop_pg_free_event_set(WalProposer *wp)
 {
 	if (waitEvents)
 	{
 		FreeWaitEventSet(waitEvents);
 		waitEvents = NULL;
 	}
+
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		wp->safekeeper[i].eventPos = -1;	/* old positions pointed into the set freed above; -1 presumably means "not registered" */
+	}
 }

 static void
-walprop_pg_init_event_set(int n_safekeepers)
+walprop_pg_init_event_set(WalProposer *wp)
 {
 	if (waitEvents)
 		elog(FATAL, "double-initialization of event set");

-	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
@@ -1439,11 +1452,11 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
 static void
 walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
 {
-	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk->conn), NULL, sk);
+	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
 }

 static int
-walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
+walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events)
 {
 	WaitEvent	event = {0};
 	int			rc = 0;
@@ -1499,7 +1512,7 @@ walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
 }

 static void
-walprop_pg_finish_sync_safekeepers(XLogRecPtr lsn)
+walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 {
 	fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
 	exit(0);
@@ -1611,7 +1624,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 			 * pageserver.
 			 */
 								quorumFeedback.rf.disk_consistent_lsn,
-								walprop_pg_get_current_timestamp(), false);
+								walprop_pg_get_current_timestamp(wp), false);
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
@@ -1628,18 +1641,65 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 }

 static void
-walprop_pg_confirm_wal_streamed(XLogRecPtr lsn)
+walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
 {
 	if (MyReplicationSlot)
 		PhysicalConfirmReceivedLocation(lsn);
 }

+static XLogRecPtr
+walprop_pg_get_redo_start_lsn(WalProposer *wp)
+{
+	return GetRedoStartLsn();	/* wp is not used here; it is part of the callback signature */
+}
+
+static bool
+walprop_pg_strong_random(WalProposer *wp, void *buf, size_t len)
+{
+	return pg_strong_random(buf, len);	/* result is passed through unchanged; wp is not used */
+}
+
+static void
+walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
+{
+	elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);	/* any call through this hook is reported as FATAL, whatever level was passed */
+}
+
+/* After election: lower truncateLsn so WAL needed by logical replication is kept. */
+static void
+walprop_pg_after_election(WalProposer *wp)
+{
+	FILE	   *f;
+	XLogRecPtr	lrRestartLsn = InvalidXLogRecPtr;
+
+	/* We don't need to do anything in syncSafekeepers mode. */
+	if (wp->config->syncSafekeepers)
+		return;
+
+	/*
+	 * "restart.lsn", when present, holds the restart position for logical
+	 * replication; WAL senders of active subscriptions need WAL from there.
+	 */
+	f = fopen("restart.lsn", "rb");
+	if (f == NULL)
+		return;
+	/* A short read or I/O error is treated the same as "no restart LSN". */
+	if (fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f) != 1)
+		lrRestartLsn = InvalidXLogRecPtr;
+	fclose(f);
+	if (lrRestartLsn == InvalidXLogRecPtr)
+		return;
+	elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
+	/* Start from the beginning of the segment to fetch page headers verified by XLogReader. */
+	lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
+	wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
+}
+
 static const walproposer_api walprop_pg = {
 	.get_shmem_state = walprop_pg_get_shmem_state,
 	.start_streaming = walprop_pg_start_streaming,
 	.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
 	.get_current_timestamp = walprop_pg_get_current_timestamp,
-	.get_timeline_id = walprop_pg_get_timeline_id,
 	.conn_error_message = walprop_error_message,
 	.conn_status = walprop_status,
 	.conn_connect_start = walprop_connect_start,
@@ -1659,9 +1719,11 @@ static const walproposer_api walprop_pg = {
 	.update_event_set = walprop_pg_update_event_set,
 	.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
 	.wait_event_set = walprop_pg_wait_event_set,
-	.strong_random = pg_strong_random,
-	.get_redo_start_lsn = GetRedoStartLsn,
+	.strong_random = walprop_pg_strong_random,
+	.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
 	.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
 	.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
 	.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
+	.log_internal = walprop_pg_log_internal,
+	.after_election = walprop_pg_after_election,
 };
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -6,6 +6,7 @@ pub use link::LinkAuthError;

 use crate::{
    auth::{self, ClientCredentials},
+    config::AuthenticationConfig,
    console::{
        self,
        provider::{CachedNodeInfo, ConsoleReqExtra},
@@ -124,6 +125,7 @@ async fn auth_quirks(
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
@@ -145,7 +147,7 @@ async fn auth_quirks(
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(api, extra, creds, client).await
+    classic::authenticate(api, extra, creds, client, config).await
 }

 impl BackendType<'_, ClientCredentials<'_>> {
@@ -180,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
        extra: &ConsoleReqExtra<'_>,
        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
        allow_cleartext: bool,
+        config: &'static AuthenticationConfig,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

@@ -192,7 +195,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext).await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            Postgres(api, creds) => {
                info!(
@@ -202,7 +205,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext).await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,6 +4,7 @@ use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
+    config::AuthenticationConfig,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
    proxy::{handle_try_wake, retry_after},
    sasl, scram,
@@ -17,6 +18,7 @@ pub(super) async fn authenticate(
    extra: &ConsoleReqExtra<'_>,
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    info!("fetching user's authentication info");
    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
@@ -42,7 +44,16 @@ pub(super) async fn authenticate(
                error
            })?;

-            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
+            let auth_outcome = tokio::time::timeout(
+                config.scram_protocol_timeout,
+                auth_flow.authenticate(),
+            )
+            .await
+            .map_err(|error| {
+                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
+                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
+            })?
+            .map_err(|error| {
                warn!(?error, "error processing scram messages");
                error
            })?;
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,5 +1,6 @@
 use futures::future::Either;
 use proxy::auth;
+use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
@@ -83,7 +84,9 @@ struct ProxyCliArgs {
    /// timeout for http connections
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    sql_over_http_timeout: tokio::time::Duration,
-
+    /// timeout for scram authentication protocol
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    scram_protocol_timeout: tokio::time::Duration,
    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
@@ -231,12 +234,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let http_config = HttpConfig {
        sql_over_http_timeout: args.sql_over_http_timeout,
    };
+    let authentication_config = AuthenticationConfig {
+        scram_protocol_timeout: args.scram_protocol_timeout,
+    };
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
        metric_collection,
        allow_self_signed_compute: args.allow_self_signed_compute,
        http_config,
+        authentication_config,
        require_client_ip: args.require_client_ip,
    }));

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,5 +1,5 @@
-use anyhow::{anyhow, Context};
-use hashbrown::HashMap;
+use anyhow::{bail, Context};
+use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::net::SocketAddr;
 use tokio::net::TcpStream;
@@ -8,7 +8,7 @@ use tracing::info;

 /// Enables serving `CancelRequest`s.
 #[derive(Default)]
-pub struct CancelMap(parking_lot::RwLock<HashMap<CancelKeyData, Option<CancelClosure>>>);
+pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);

 impl CancelMap {
    /// Cancel a running query for the corresponding connection.
@@ -16,7 +16,6 @@ impl CancelMap {
        // NB: we should immediately release the lock after cloning the token.
        let cancel_closure = self
            .0
-            .read()
            .get(&key)
            .and_then(|x| x.clone())
            .with_context(|| format!("query cancellation key not found: {key}"))?;
@@ -40,15 +39,19 @@ impl CancelMap {

        // Random key collisions are unlikely to happen here, but they're still possible,
        // which is why we have to take care not to rewrite an existing key.
-        self.0
-            .write()
-            .try_insert(key, None)
-            .map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
+        match self.0.entry(key) {
+            dashmap::mapref::entry::Entry::Occupied(_) => {
+                bail!("query cancellation key already exists: {key}")
+            }
+            dashmap::mapref::entry::Entry::Vacant(e) => {
+                e.insert(None);
+            }
+        }

        // This will guarantee that the session gets dropped
        // as soon as the future is finished.
        scopeguard::defer! {
-            self.0.write().remove(&key);
+            self.0.remove(&key);
            info!("dropped query cancellation key {key}");
        }

@@ -59,12 +62,12 @@ impl CancelMap {

    #[cfg(test)]
    fn contains(&self, session: &Session) -> bool {
-        self.0.read().contains_key(&session.key)
+        self.0.contains_key(&session.key)
    }

    #[cfg(test)]
    fn is_empty(&self) -> bool {
-        self.0.read().is_empty()
+        self.0.is_empty()
    }
 }

@@ -113,10 +116,7 @@ impl Session<'_> {
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
        info!("enabling query cancellation for this session");
-        self.cancel_map
-            .0
-            .write()
-            .insert(self.key, Some(cancel_closure));
+        self.cancel_map.0.insert(self.key, Some(cancel_closure));

        self.key
    }
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -14,6 +14,7 @@ pub struct ProxyConfig {
    pub metric_collection: Option<MetricCollectionConfig>,
    pub allow_self_signed_compute: bool,
    pub http_config: HttpConfig,
+    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
 }

@@ -32,6 +33,10 @@ pub struct HttpConfig {
    pub sql_over_http_timeout: tokio::time::Duration,
 }

+pub struct AuthenticationConfig {
+    pub scram_protocol_timeout: tokio::time::Duration, // limit applied around the SCRAM `auth_flow.authenticate()` call
+}
+
 impl TlsConfig {
    pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
        self.config.clone()
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -194,9 +194,10 @@ impl GlobalConnPool {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
                connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
            } else {
-                latency_timer.pool_hit();
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
+                latency_timer.pool_hit();
+                latency_timer.success();
                return Ok(Client {
                    inner: Some(client),
                    span: Span::current(),
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -5,7 +5,7 @@ use crate::{
    auth::{self, backend::AuthSuccess},
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
-    config::{ProxyConfig, TlsConfig},
+    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    http::StatusCode,
    metrics::{Ids, USAGE_METRICS},
@@ -96,7 +96,9 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "proxy_compute_connection_latency_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        &["protocol", "cache_miss", "pool_miss"],
+        // http/ws/tcp, true/false, true/false, success/failed
+        // 3 * 2 * 2 * 2 = 24 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome"],
        // largest bucket = 2^16 * 0.5ms = 32s
        exponential_buckets(0.0005, 2.0, 16).unwrap(),
    )
@@ -105,19 +107,22 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {

 pub struct LatencyTimer {
    start: Instant,
-    pool_miss: bool,
-    cache_miss: bool,
    protocol: &'static str,
+    cache_miss: bool,
+    pool_miss: bool,
+    outcome: &'static str,
 }

 impl LatencyTimer {
    pub fn new(protocol: &'static str) -> Self {
        Self {
            start: Instant::now(),
+            protocol,
            cache_miss: false,
            // by default we don't do pooling
            pool_miss: true,
-            protocol,
+            // assume failed unless otherwise specified
+            outcome: "failed",
        }
    }

@@ -128,6 +133,10 @@ impl LatencyTimer {
    pub fn pool_hit(&mut self) {
        self.pool_miss = false;
    }
+
+    pub fn success(mut self) {
+        self.outcome = "success"; // `self` is taken by value, so Drop runs on return and observes this outcome
+    }
 }

 impl Drop for LatencyTimer {
@@ -138,6 +147,7 @@ impl Drop for LatencyTimer {
                self.protocol,
                bool_to_str(self.cache_miss),
                bool_to_str(self.pool_miss),
+                self.outcome,
            ])
            .observe(duration)
    }
@@ -340,7 +350,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        mode.allow_self_signed_compute(config),
    );
    cancel_map
-        .with_session(|session| client.connect_to_db(session, mode))
+        .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
        .await
 }

@@ -547,7 +557,10 @@ where

    // try once
    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => return Ok(res),
+        Ok(res) => {
+            latency_timer.success();
+            return Ok(res);
+        }
        Err(e) => {
            error!(error = ?e, "could not connect to compute node");
            (invalidate_cache(node_info), e)
@@ -601,7 +614,10 @@ where
    info!("wake_compute success. attempting to connect");
    loop {
        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => return Ok(res),
+            Ok(res) => {
+                latency_timer.success();
+                return Ok(res);
+            }
            Err(e) => {
                let retriable = e.should_retry(num_retries);
                if !retriable {
@@ -818,6 +834,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        self,
        session: cancellation::Session<'_>,
        mode: ClientMode,
+        config: &'static AuthenticationConfig,
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
@@ -835,7 +852,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        let latency_timer = LatencyTimer::new(mode.protocol_label());

        let auth_result = match creds
-            .authenticate(&extra, &mut stream, mode.allow_cleartext())
+            .authenticate(&extra, &mut stream, mode.allow_cleartext(), config)
            .await
        {
            Ok(auth_result) => auth_result,
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -3,7 +3,7 @@
 //
 use anyhow::{bail, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use clap::Parser;
+use clap::{ArgAction, Parser};
 use futures::future::BoxFuture;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt, StreamExt};
@@ -105,6 +105,9 @@ struct Args {
    /// it during this period passed as a human readable duration.
    #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
    heartbeat_timeout: Duration,
+    /// Enable/disable peer recovery.
+    #[arg(long, default_value = "false", action=ArgAction::Set)]
+    peer_recovery: bool,
    /// Remote storage configuration for WAL backup (offloading to s3) as TOML
    /// inline table, e.g.
    ///   {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
@@ -265,6 +268,7 @@ async fn main() -> anyhow::Result<()> {
        broker_endpoint: args.broker_endpoint,
        broker_keepalive_interval: args.broker_keepalive_interval,
        heartbeat_timeout: args.heartbeat_timeout,
+        peer_recovery_enabled: args.peer_recovery,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -372,6 +372,13 @@ impl SafekeeperPostgresHandler {
    /// from a walproposer recovery function. This connection gets a special handling:
    /// safekeeper must stream all local WAL till the flush_lsn, whether committed or not.
    pub fn is_walproposer_recovery(&self) -> bool {
-        self.appname == Some("wal_proposer_recovery".to_string())
+        match &self.appname {
+            None => false,
+            Some(appname) => {
+                appname == "wal_proposer_recovery" ||
+                // set by safekeeper peer recovery
+                appname.starts_with("safekeeper")
+            }
+        }
    }
 }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -16,8 +16,8 @@ use tokio::io::AsyncReadExt;
 use utils::http::endpoint::request_span;

 use crate::receive_wal::WalReceiverState;
-use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
+use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
 use crate::{debug_dump, pull_timeline};
@@ -60,16 +60,25 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
        .as_ref()
 }

-/// Same as TermSwitchEntry, but serializes LSN using display serializer
+/// Same as TermLsn, but serializes LSN using display serializer
 /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
 #[serde_as]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct TermSwitchApiEntry {
    pub term: Term,
    #[serde_as(as = "DisplayFromStr")]
    pub lsn: Lsn,
 }

+/// Converts the API response entry into a `TermLsn`.
+impl From<TermSwitchApiEntry> for TermLsn {
+    fn from(api_val: TermSwitchApiEntry) -> Self {
+        // Plain field-by-field move: `term` and `lsn` carry over unchanged.
+        let TermSwitchApiEntry { term, lsn } = api_val;
+        TermLsn { term, lsn }
+    }
+}
+
 /// Augment AcceptorState with epoch for convenience
 #[derive(Debug, Serialize, Deserialize)]
 pub struct AcceptorStateStatus {
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -62,6 +62,7 @@ pub struct SafeKeeperConf {
    pub broker_endpoint: Uri,
    pub broker_keepalive_interval: Duration,
    pub heartbeat_timeout: Duration,
+    pub peer_recovery_enabled: bool,
    pub remote_storage: Option<RemoteStorageConfig>,
    pub max_offloader_lag_bytes: u64,
    pub backup_parallel_jobs: usize,
@@ -100,6 +101,7 @@ impl SafeKeeperConf {
                .parse()
                .expect("failed to parse default broker endpoint"),
            broker_keepalive_interval: Duration::from_secs(5),
+            peer_recovery_enabled: true,
            wal_backup_enabled: true,
            backup_parallel_jobs: 1,
            pg_auth: None,
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -55,9 +55,12 @@ impl WalReceivers {

    /// Register new walreceiver. Returned guard provides access to the slot and
    /// automatically deregisters in Drop.
-    pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
+    pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
        let slots = &mut self.mutex.lock().slots;
-        let walreceiver = WalReceiverState::Voting;
+        let walreceiver = WalReceiverState {
+            conn_id,
+            status: WalReceiverStatus::Voting,
+        };
        // find empty slot or create new one
        let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
            slots[pos] = Some(walreceiver);
@@ -96,6 +99,18 @@ impl WalReceivers {
        self.mutex.lock().slots.iter().flatten().cloned().collect()
    }

+    /// Count walreceivers which belong to a compute connection and have
+    /// reached the Streaming status (normally 0 or 1).
+    pub fn get_num_streaming(self: &Arc<WalReceivers>) -> usize {
+        let shared = self.mutex.lock();
+        shared
+            .slots
+            .iter()
+            .flatten()
+            // conn_id is None for recovery, which registers here as well; skip it
+            .filter(|wr| wr.conn_id.is_some())
+            .filter(|wr| matches!(wr.status, WalReceiverStatus::Streaming))
+            .count()
+    }
+
    /// Unregister walsender.
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
@@ -108,10 +123,17 @@ struct WalReceiversShared {
    slots: Vec<Option<WalReceiverState>>,
 }

+/// State of a single registered walreceiver, as stored in a WalReceivers slot.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalReceiverState {
+    /// None means it is recovery initiated by us (this safekeeper).
+    pub conn_id: Option<ConnectionId>,
+    /// Whether the receiver is still voting or already streaming.
+    pub status: WalReceiverStatus,
+}
+
 /// Walreceiver status. Currently only whether it passed voting stage and
 /// started receiving the stream, but it is easy to add more if needed.
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum WalReceiverState {
+pub enum WalReceiverStatus {
    Voting,
    Streaming,
 }
@@ -136,8 +158,8 @@ impl Drop for WalReceiverGuard {
    }
 }

-const MSG_QUEUE_SIZE: usize = 256;
-const REPLY_QUEUE_SIZE: usize = 16;
+pub const MSG_QUEUE_SIZE: usize = 256;
+pub const REPLY_QUEUE_SIZE: usize = 16;

 impl SafekeeperPostgresHandler {
    /// Wrapper around handle_start_wal_push_guts handling result. Error is
@@ -261,7 +283,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
            tli.clone(),
            msg_rx,
            reply_tx,
-            self.conn_id,
+            Some(self.conn_id),
        ));

        // Forward all messages to WalAcceptor
@@ -317,31 +339,41 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
 // even when it writes a steady stream of messages.
 const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);

-/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
-struct WalAcceptor {
+/// Encapsulates a task which takes messages from msg_rx, processes and pushes
+/// replies to reply_tx; reading from socket and writing to disk in parallel is
+/// beneficial for performance, this struct provides writing to disk part.
+pub struct WalAcceptor {
    tli: Arc<Timeline>,
    msg_rx: Receiver<ProposerAcceptorMessage>,
    reply_tx: Sender<AcceptorProposerMessage>,
+    conn_id: Option<ConnectionId>,
 }

 impl WalAcceptor {
-    /// Spawn thread with WalAcceptor running, return handle to it.
-    fn spawn(
+    /// Spawn task with WalAcceptor running, return handle to it. Task returns
+    /// Ok(()) if either of channels has closed, and Err if any error during
+    /// message processing is encountered.
+    ///
+    /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
+    pub fn spawn(
        tli: Arc<Timeline>,
        msg_rx: Receiver<ProposerAcceptorMessage>,
        reply_tx: Sender<AcceptorProposerMessage>,
-        conn_id: ConnectionId,
+        conn_id: Option<ConnectionId>,
    ) -> JoinHandle<anyhow::Result<()>> {
        task::spawn(async move {
            let mut wa = WalAcceptor {
                tli,
                msg_rx,
                reply_tx,
+                conn_id,
            };

            let span_ttid = wa.tli.ttid; // satisfy borrow checker
            wa.run()
-                .instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
+                .instrument(
+                    info_span!("WAL acceptor", cid = %conn_id.unwrap_or(0), ttid = %span_ttid),
+                )
                .await
        })
    }
@@ -355,7 +387,7 @@ impl WalAcceptor {
        let _compute_conn_guard = ComputeConnectionGuard {
            timeline: Arc::clone(&self.tli),
        };
-        let walreceiver_guard = self.tli.get_walreceivers().register();
+        let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
        self.tli.update_status_notify().await?;

        // After this timestamp we will stop processing AppendRequests and send a response
@@ -372,7 +404,7 @@ impl WalAcceptor {

            // Update walreceiver state in shmem for reporting.
            if let ProposerAcceptorMessage::Elected(_) = &next_msg {
-                *walreceiver_guard.get() = WalReceiverState::Streaming;
+                walreceiver_guard.get().status = WalReceiverStatus::Streaming;
            }

            let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -1,17 +1,41 @@
 //! This module implements pulling WAL from peer safekeepers if compute can't
 //! provide it, i.e. safekeeper lags too much.

-use std::sync::Arc;
+use std::time::SystemTime;
+use std::{fmt, pin::pin, sync::Arc};

-use tokio::{select, time::sleep, time::Duration};
-use tracing::{info, instrument};
+use anyhow::{bail, Context};
+use futures::StreamExt;
+use postgres_protocol::message::backend::ReplicationMessage;
+use tokio::sync::mpsc::{channel, Receiver, Sender};
+use tokio::time::timeout;
+use tokio::{
+    select,
+    time::sleep,
+    time::{self, Duration},
+};
+use tokio_postgres::replication::ReplicationStream;
+use tokio_postgres::types::PgLsn;
+use tracing::*;
+use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};

-use crate::{timeline::Timeline, SafeKeeperConf};
+use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
+use crate::safekeeper::{AppendRequest, AppendRequestHeader};
+use crate::{
+    http::routes::TimelineStatus,
+    receive_wal::MSG_QUEUE_SIZE,
+    safekeeper::{
+        AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
+        TermLsn, VoteRequest,
+    },
+    timeline::{PeerInfo, Timeline},
+    SafeKeeperConf,
+};

 /// Entrypoint for per timeline task which always runs, checking whether
 /// recovery for this safekeeper is needed and starting it if so.
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
-pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
+pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    info!("started");
    let mut cancellation_rx = match tli.get_cancellation_rx() {
        Ok(rx) => rx,
@@ -22,19 +46,387 @@ pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
    };

    select! {
-        _ = recovery_main_loop(tli) => { unreachable!() }
+        _ = recovery_main_loop(tli, conf) => { unreachable!() }
        _ = cancellation_rx.changed() => {
            info!("stopped");
        }
    }
 }

+/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
+/// fields to explain the choice.
+#[derive(Debug)]
+pub struct RecoveryNeededInfo {
+    /// my term
+    pub term: Term,
+    /// my last_log_term
+    pub last_log_term: Term,
+    /// my flush_lsn
+    pub flush_lsn: Lsn,
+    /// peers from which we can fetch WAL, for observability.
+    pub peers: Vec<PeerInfo>,
+    /// for observability
+    pub num_streaming_computes: usize,
+    /// Safekeepers we can recover from. The recovery loop uses the first
+    /// entry; an empty vec means recovery is not needed or not possible.
+    pub donors: Vec<Donor>,
+}
+
+// Custom to omit not important fields from PeerInfo.
+impl fmt::Display for RecoveryNeededInfo {
+    /// Writes the info as a single brace-delimited record. For every peer
+    /// only sk_id, term, last_log_term and flush_lsn are printed; donors are
+    /// printed with their Debug representation.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(
+            f,
+            "{{term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
+            self.term, self.last_log_term, self.flush_lsn
+        )?;
+        for p in self.peers.iter() {
+            write!(
+                f,
+                "PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
+                p.sk_id, p.term, p.last_log_term, p.flush_lsn
+            )?;
+        }
+        // Close the peers list, separate it from the next field and close
+        // the outer record opened above so that braces in the output balance.
+        write!(
+            f,
+            "}}, num_streaming_computes: {}, donors: {:?}}}",
+            self.num_streaming_computes, self.donors
+        )
+    }
+}
+
+/// A peer safekeeper from which WAL can be fetched during recovery, with the
+/// subset of PeerInfo needed to connect to it.
+#[derive(Clone, Debug)]
+pub struct Donor {
+    pub sk_id: NodeId,
+    /// equals to last_log_term
+    pub term: Term,
+    pub flush_lsn: Lsn,
+    /// Used to open the WAL streaming (replication) connection.
+    pub pg_connstr: String,
+    /// Address used to fetch the donor's timeline status over HTTP.
+    pub http_connstr: String,
+}
+
+/// Builds a Donor by copying the relevant fields out of a PeerInfo.
+impl From<&PeerInfo> for Donor {
+    fn from(p: &PeerInfo) -> Self {
+        // Connection strings are owned Strings, so they have to be cloned.
+        let pg_connstr = p.pg_connstr.clone();
+        let http_connstr = p.http_connstr.clone();
+        Self {
+            sk_id: p.sk_id,
+            term: p.term,
+            flush_lsn: p.flush_lsn,
+            pg_connstr,
+            http_connstr,
+        }
+    }
+}
+
 const CHECK_INTERVAL_MS: u64 = 2000;

 /// Check regularly whether we need to start recovery.
-async fn recovery_main_loop(_tli: Arc<Timeline>) {
+async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
+    // This loop never returns on its own; recovery_main stops it by racing it
+    // against the timeline cancellation channel.
    let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
    loop {
+        // Only the first donor is tried. Whether recovery succeeds or fails,
+        // we sleep and re-evaluate the need for recovery on the next iteration.
+        let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+        match recovery_needed_info.donors.first() {
+            Some(donor) => {
+                info!(
+                    "starting recovery from donor {}: {}",
+                    donor.sk_id, recovery_needed_info
+                );
+                match recover(tli.clone(), donor, &conf).await {
+                    // Note: 'write_wal rewrites WAL written before' error is
+                    // expected here and might happen if compute and recovery
+                    // concurrently write the same data. Eventually compute
+                    // should win.
+                    Err(e) => warn!("recovery failed: {:#}", e),
+                    Ok(msg) => info!("recovery finished: {}", msg),
+                }
+            }
+            None => {
+                trace!(
+                    "recovery not needed or not possible: {}",
+                    recovery_needed_info
+                );
+            }
+        }
        sleep(check_duration).await;
    }
 }
+
+/// Recover from the specified donor. Returns message explaining normal finish
+/// reason or error.
+///
+/// Steps: fetch the donor's term history over HTTP, get our own term history
+/// by processing a VoteRequest locally, find the last common point of the
+/// two, pass ProposerElected to truncate local WAL there and finally stream
+/// WAL from the donor starting at that point.
+async fn recover(
+    tli: Arc<Timeline>,
+    donor: &Donor,
+    conf: &SafeKeeperConf,
+) -> anyhow::Result<String> {
+    // Learn donor term switch history to figure out starting point.
+    let client = reqwest::Client::new();
+    let timeline_info: TimelineStatus = client
+        .get(format!(
+            "http://{}/v1/tenant/{}/timeline/{}",
+            donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
+        ))
+        .send()
+        .await
+        .context("donor timeline status request")?
+        // Turn non-2xx replies into an error right away; otherwise they would
+        // surface below as a confusing JSON deserialization failure.
+        .error_for_status()
+        .context("donor timeline status request")?
+        .json()
+        .await
+        .context("parsing donor timeline status")?;
+    if timeline_info.acceptor_state.term != donor.term {
+        bail!(
+            "donor term changed from {} to {}",
+            donor.term,
+            timeline_info.acceptor_state.term
+        );
+    }
+    // convert from API TermSwitchApiEntry into TermLsn.
+    let donor_th = TermHistory(
+        timeline_info
+            .acceptor_state
+            .term_history
+            .iter()
+            .map(|tl| Into::<TermLsn>::into(*tl))
+            .collect(),
+    );
+
+    // Now understand our term history.
+    let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
+    let vote_response = match tli
+        .process_msg(&vote_request)
+        .await
+        .context("VoteRequest handling")?
+    {
+        Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
+        _ => {
+            bail!("unexpected VoteRequest response"); // unreachable
+        }
+    };
+    if vote_response.term != donor.term {
+        bail!(
+            "our term changed from {} to {}",
+            donor.term,
+            vote_response.term
+        );
+    }
+
+    let last_common_point = match TermHistory::find_highest_common_point(
+        &donor_th,
+        &vote_response.term_history,
+        vote_response.flush_lsn,
+    ) {
+        None => bail!(
+            "couldn't find common point in histories, donor {:?}, sk {:?}",
+            donor_th,
+            vote_response.term_history,
+        ),
+        Some(lcp) => lcp,
+    };
+    info!("found last common point at {:?}", last_common_point);
+
+    // truncate WAL locally
+    let pe = ProposerAcceptorMessage::Elected(ProposerElected {
+        term: donor.term,
+        start_streaming_at: last_common_point.lsn,
+        term_history: donor_th,
+        timeline_start_lsn: Lsn::INVALID,
+    });
+    // Successful ProposerElected handling always returns None. If term changed,
+    // we'll find out that during the streaming. Note: it is expected to get
+    // 'refusing to overwrite correct WAL' here if walproposer reconnected
+    // concurrently, restart helps here.
+    tli.process_msg(&pe)
+        .await
+        .context("ProposerElected handling")?;
+
+    recovery_stream(tli, donor, last_common_point.lsn, conf).await
+}
+
+// Pull WAL from donor, assuming handshake is already done.
+//
+// Opens a physical replication connection to the donor, starts streaming at
+// start_streaming_at and feeds the received WAL to a WalAcceptor task spawned
+// for this timeline. Returns the success description produced by network_io,
+// or an error coming from the network side, the term check in read_replies
+// or the WalAcceptor itself.
+async fn recovery_stream(
+    tli: Arc<Timeline>,
+    donor: &Donor,
+    start_streaming_at: Lsn,
+    conf: &SafeKeeperConf,
+) -> anyhow::Result<String> {
+    // TODO: pass auth token
+    let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
+    let mut cfg = cfg.to_tokio_postgres_config();
+    // It will make safekeeper give out not committed WAL (up to flush_lsn).
+    cfg.application_name(&format!("safekeeper_{}", conf.my_id));
+    cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
+
+    // Bound the connection attempt so that an unresponsive donor fails this
+    // recovery attempt instead of blocking it.
+    let connect_timeout = Duration::from_millis(10000);
+    let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
+        .await
+    {
+        Ok(client_and_conn) => client_and_conn?,
+        Err(_elapsed) => {
+            bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
+        }
+    };
+    trace!("connected to {:?}", donor);
+
+    // The connection object performs the actual communication with the
+    // server, spawn it off to run on its own.
+    let ttid = tli.ttid;
+    tokio::spawn(async move {
+        if let Err(e) = connection
+            .instrument(info_span!("recovery task connection poll", ttid = %ttid))
+            .await
+        {
+            // This logging isn't very useful as error is anyway forwarded to client.
+            trace!(
+                "tokio_postgres connection object finished with error: {}",
+                e
+            );
+        }
+    });
+
+    // The term is passed along with the start position in the replication command.
+    let query = format!(
+        "START_REPLICATION PHYSICAL {} (term='{}')",
+        start_streaming_at, donor.term
+    );
+
+    let copy_stream = client.copy_both_simple(&query).await?;
+    let physical_stream = ReplicationStream::new(copy_stream);
+
+    // As in normal walreceiver, do networking and writing to disk in parallel.
+    let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
+    let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
+    // conn_id None tells WalAcceptor it is used by recovery initiated at this
+    // safekeeper rather than by a compute connection.
+    let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
+
+    // Whichever future finishes first decides the result; the other one is
+    // dropped together with its channel end.
+    let res = tokio::select! {
+        r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
+        r = read_replies(reply_rx, donor.term) => r.map(|()| None),
+    };
+
+    // Join the spawned WalAcceptor. At this point chans to/from it passed to
+    // network routines are dropped, so it will exit as soon as it touches them.
+    match wa.await {
+        Ok(Ok(())) => {
+            // WalAcceptor finished normally, termination reason is different
+            match res {
+                Ok(Some(success_desc)) => Ok(success_desc),
+                Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
+                Err(e) => Err(e), // network error or term change
+            }
+        }
+        Ok(Err(e)) => Err(e), // error while processing message
+        Err(e) => bail!("WalAcceptor panicked: {}", e),
+    }
+}
+
+// Perform network part of streaming: read data and push it to msg_tx, send KA
+// to make sender hear from us. If there is nothing coming for a while, check
+// for termination.
+// Returns
+// - Ok(None) if channel to WalAcceptor closed -- its task should return error.
+// - Ok(Some(String)) if recovery successfully completed.
+// - Err if error happened while reading/writing to socket.
+async fn network_io(
+    physical_stream: ReplicationStream,
+    msg_tx: Sender<ProposerAcceptorMessage>,
+    donor: Donor,
+    tli: Arc<Timeline>,
+    conf: SafeKeeperConf,
+) -> anyhow::Result<Option<String>> {
+    let mut physical_stream = pin!(physical_stream);
+    // End LSN of the last XLogData message seen; reported back to the donor
+    // in status updates and used in the final message.
+    let mut last_received_lsn = Lsn::INVALID;
+    // tear down connection if no data arrives within this period
+    let no_data_timeout = Duration::from_millis(30000);
+
+    loop {
+        let msg = match timeout(no_data_timeout, physical_stream.next()).await {
+            Ok(next) => match next {
+                None => bail!("unexpected end of replication stream"),
+                Some(msg) => msg.context("get replication message")?,
+            },
+            Err(_) => bail!("no message received within {:?}", no_data_timeout),
+        };
+
+        match msg {
+            ReplicationMessage::XLogData(xlog_data) => {
+                // Wrap the received WAL chunk into an AppendRequest as if it
+                // came from a proposer with the donor's term.
+                let ar_hdr = AppendRequestHeader {
+                    term: donor.term,
+                    epoch_start_lsn: Lsn::INVALID, // unused
+                    begin_lsn: Lsn(xlog_data.wal_start()),
+                    end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
+                    commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
+                    truncate_lsn: Lsn::INVALID, // do not attempt to advance
+                    proposer_uuid: [0; 16],
+                };
+                let ar = AppendRequest {
+                    h: ar_hdr,
+                    wal_data: xlog_data.into_data(),
+                };
+                trace!(
+                    "processing AppendRequest {}-{}, len {}",
+                    ar.h.begin_lsn,
+                    ar.h.end_lsn,
+                    ar.wal_data.len()
+                );
+                last_received_lsn = ar.h.end_lsn;
+                if msg_tx
+                    .send(ProposerAcceptorMessage::AppendRequest(ar))
+                    .await
+                    .is_err()
+                {
+                    return Ok(None); // chan closed, WalAcceptor terminated
+                }
+            }
+            ReplicationMessage::PrimaryKeepAlive(_) => {
+                // keepalive means nothing is being streamed for a while. Check whether we need to stop.
+                let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
+                // do current donors still contain one we currently connected to?
+                if !recovery_needed_info
+                    .donors
+                    .iter()
+                    .any(|d| d.sk_id == donor.sk_id)
+                {
+                    // Most likely it means we are caughtup.
+                    // note: just exiting makes tokio_postgres send CopyFail to the far end.
+                    return Ok(Some(format!(
+                        "terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
+                        last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
+                    )));
+                }
+            }
+            // Any other replication message is ignored.
+            _ => {}
+        }
+        // Send reply to each message to keep connection alive. Ideally we
+        // should do that once in a while instead, but this again requires
+        // stream split or similar workaround, and recovery is anyway not that
+        // performance critical.
+        //
+        // We do not know here real write/flush LSNs (need to take mutex again
+        // or check replies which are read in different future), but neither
+        // sender much cares about them, so just send last received.
+        physical_stream
+            .as_mut()
+            .standby_status_update(
+                PgLsn::from(last_received_lsn.0),
+                PgLsn::from(last_received_lsn.0),
+                PgLsn::from(last_received_lsn.0),
+                SystemTime::now(),
+                0,
+            )
+            .await?;
+    }
+}
+
+// Read replies from WalAcceptor. We are not interested much in sending them to
+// donor safekeeper, so don't route them anywhere. However, we should check if
+// our term changes and exit if it does.
+// Returns Ok(()) if channel closed, Err in case of term change.
+async fn read_replies(
+    mut reply_rx: Receiver<AcceptorProposerMessage>,
+    donor_term: Term,
+) -> anyhow::Result<()> {
+    // recv() yields None once WalAcceptor has dropped its sender.
+    while let Some(msg) = reply_rx.recv().await {
+        if let AcceptorProposerMessage::AppendResponse(ar) = msg {
+            // Replies come from our local WalAcceptor, so a mismatch means
+            // this safekeeper moved to another term, not the donor; word the
+            // error the same way as the equivalent check in recover().
+            if ar.term != donor_term {
+                bail!("our term changed from {} to {}", donor_term, ar.term);
+            }
+        }
+    }
+    Ok(()) // chan closed, WalAcceptor terminated
+}
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -91,6 +91,69 @@ impl TermHistory {
        }
        TermHistory(res)
    }
+
+    /// Locate the last point up to which the leader (walproposer) term
+    /// history and the safekeeper term history agree. The arguments are not
+    /// symmetric: the proposer history ends at +infinity, while the
+    /// safekeeper one ends at its flush_lsn (`sk_wal_end`).
+    /// Returns None if the histories have no common term.
+    /// C version is at walproposer SendProposerElected.
+    pub fn find_highest_common_point(
+        prop_th: &TermHistory,
+        sk_th: &TermHistory,
+        sk_wal_end: Lsn,
+    ) -> Option<TermLsn> {
+        let prop_entries = &prop_th.0;
+        let sk_entries = &sk_th.0;
+
+        if let Some(sk_th_last) = sk_entries.last() {
+            assert!(
+                sk_th_last.lsn <= sk_wal_end,
+                "safekeeper term history end {:?} LSN is higher than WAL end {:?}",
+                sk_th_last,
+                sk_wal_end
+            );
+        }
+
+        // Walk both histories in lockstep for as long as the terms match,
+        // remembering the index of the last matching entry.
+        let mut last_common_idx = None;
+        for (i, (prop_entry, sk_entry)) in prop_entries.iter().zip(sk_entries.iter()).enumerate() {
+            if prop_entry.term != sk_entry.term {
+                break;
+            }
+            // Equal terms must start at equal LSNs.
+            assert!(
+                prop_entry.lsn == sk_entry.lsn,
+                "same term {} has different start LSNs: prop {}, sk {}",
+                prop_entry.term,
+                prop_entry.lsn,
+                sk_entry.lsn
+            );
+            last_common_idx = Some(i);
+        }
+        // No common term at all means no common point.
+        let last_common_idx = last_common_idx?;
+        let common_term = prop_entries[last_common_idx].term;
+        let next_idx = last_common_idx + 1;
+
+        // The common term ends where the next term starts. When it is the
+        // last term, the safekeeper side ends at flush_lsn and the proposer
+        // side at +infinity, so flush_lsn wins. Otherwise take the smaller
+        // of the two ends.
+        let lsn = match prop_entries.get(next_idx) {
+            None => sk_wal_end,
+            Some(prop_next) => {
+                let sk_common_term_end = sk_entries
+                    .get(next_idx)
+                    .map_or(sk_wal_end, |sk_next| sk_next.lsn);
+                min(prop_next.lsn, sk_common_term_end)
+            }
+        };
+        Some(TermLsn {
+            term: common_term,
+            lsn,
+        })
+    }
 }

 /// Display only latest entries for Debug.
@@ -305,19 +368,19 @@ pub struct AcceptorGreeting {
 /// Vote request sent from proposer to safekeepers
 #[derive(Debug, Deserialize)]
 pub struct VoteRequest {
-    term: Term,
+    pub term: Term,
 }

 /// Vote itself, sent from safekeeper to proposer
 #[derive(Debug, Serialize)]
 pub struct VoteResponse {
-    term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
+    pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    vote_given: u64, // fixme u64 due to padding
    // Safekeeper flush_lsn (end of WAL) + history of term switches allow
    // proposer to choose the most advanced one.
-    flush_lsn: Lsn,
+    pub flush_lsn: Lsn,
    truncate_lsn: Lsn,
-    term_history: TermHistory,
+    pub term_history: TermHistory,
    timeline_start_lsn: Lsn,
 }

@@ -344,7 +407,8 @@ pub struct AppendRequest {
 pub struct AppendRequestHeader {
    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    pub term: Term,
-    // LSN since the proposer appends WAL; determines epoch switch point.
+    // TODO: remove this field, it is unused -- LSN of term switch can be taken
+    // from ProposerElected (as well as from term history).
    pub epoch_start_lsn: Lsn,
    /// start position of message in WAL
    pub begin_lsn: Lsn,
@@ -759,7 +823,7 @@ where
            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
                   msg.term, self.flush_lsn(), msg.start_streaming_at)
        }
-        // Otherwise this shouldn't happen.
+        // Otherwise we must never attempt to truncate committed data.
        assert!(
            msg.start_streaming_at >= self.inmem.commit_lsn,
            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
@@ -810,6 +874,14 @@ where

        info!("start receiving WAL since {:?}", msg.start_streaming_at);

+        // Cache LSN where term starts to immediately fsync control file with
+        // commit_lsn once we reach it -- sync-safekeepers finishes when
+        // persisted commit_lsn on majority of safekeepers aligns.
+        self.epoch_start_lsn = match msg.term_history.0.last() {
+            None => bail!("proposer elected with empty term history"),
+            Some(term_lsn_start) => term_lsn_start.lsn,
+        };
+
        Ok(None)
    }

@@ -835,10 +907,7 @@ where
        // file: walproposer in sync mode is very interested when this
        // happens. Note: this is for sync-safekeepers mode only, as
        // otherwise commit_lsn might jump over epoch_start_lsn.
-        // Also note that commit_lsn can reach epoch_start_lsn earlier
-        // that we receive new epoch_start_lsn, and we still need to sync
-        // control file in this case.
-        if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
+        if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
            self.persist_control_file(self.state.clone()).await?;
        }

@@ -902,7 +971,6 @@ where
        // Now we know that we are in the same term as the proposer,
        // processing the message.

-        self.epoch_start_lsn = msg.h.epoch_start_lsn;
        self.inmem.proposer_uuid = msg.h.proposer_uuid;

        // do the job
@@ -1185,4 +1253,65 @@ mod tests {
        sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
        assert_eq!(sk.get_epoch(), 1);
    }
+
+    #[test]
+    fn test_find_highest_common_point_none() {
+        // The histories already differ in the very first term.
+        let prop_th = TermHistory(vec![(0, Lsn(1)).into()]);
+        let sk_th = TermHistory(vec![(1, Lsn(1)).into(), (2, Lsn(2)).into()]);
+        let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(3));
+        assert_eq!(found, None);
+    }
+
+    #[test]
+    fn test_find_highest_common_point_middle() {
+        // Terms 1 and 2 are shared, then the histories diverge.
+        let prop_th = TermHistory(vec![
+            (1, Lsn(10)).into(),
+            (2, Lsn(20)).into(),
+            (4, Lsn(40)).into(),
+        ]);
+        let sk_th = TermHistory(vec![
+            (1, Lsn(10)).into(),
+            (2, Lsn(20)).into(),
+            (3, Lsn(30)).into(), // sk ends last common term 2 at 30
+        ]);
+        let expected = Some(TermLsn {
+            term: 2,
+            lsn: Lsn(30),
+        });
+        let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(40));
+        assert_eq!(found, expected);
+    }
+
+    #[test]
+    fn test_find_highest_common_point_sk_end() {
+        // The safekeeper history stops at the last common term.
+        let prop_th = TermHistory(vec![
+            (1, Lsn(10)).into(),
+            (2, Lsn(20)).into(), // last common term 2, sk will end it at 32 sk_end_lsn
+            (4, Lsn(40)).into(),
+        ]);
+        let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
+        let expected = Some(TermLsn {
+            term: 2,
+            lsn: Lsn(32),
+        });
+        let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32));
+        assert_eq!(found, expected);
+    }
+
+    #[test]
+    fn test_find_highest_common_point_walprop() {
+        // Identical histories: the common point is the safekeeper WAL end.
+        let prop_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
+        let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
+        let expected = Some(TermLsn {
+            term: 2,
+            lsn: Lsn(32),
+        });
+        let found = TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32));
+        assert_eq!(found, expected);
+    }
 }
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -418,10 +418,11 @@ impl SafekeeperPostgresHandler {
        }

        info!(
-            "starting streaming from {:?}, available WAL ends at {}, recovery={}",
+            "starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
            start_pos,
            end_pos,
-            matches!(end_watch, EndWatch::Flush(_))
+            matches!(end_watch, EndWatch::Flush(_)),
+            appname
        );

        // switch to copy
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -11,6 +11,7 @@ use tokio::fs;
 use serde_with::DisplayFromStr;
 use std::cmp::max;
 use std::sync::Arc;
+use std::time::Duration;
 use tokio::sync::{Mutex, MutexGuard};
 use tokio::{
    sync::{mpsc::Sender, watch},
@@ -27,7 +28,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

 use crate::receive_wal::WalReceivers;
-use crate::recovery::recovery_main;
+use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
    SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
@@ -45,11 +46,12 @@ use crate::{debug_dump, wal_storage};
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PeerInfo {
    pub sk_id: NodeId,
+    pub term: Term,
    /// Term of the last entry.
-    _last_log_term: Term,
+    pub last_log_term: Term,
    /// LSN of the last record.
    #[serde_as(as = "DisplayFromStr")]
-    _flush_lsn: Lsn,
+    pub flush_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
@@ -61,16 +63,21 @@ pub struct PeerInfo {
    #[serde(skip)]
    #[serde(default = "Instant::now")]
    ts: Instant,
+    pub pg_connstr: String,
+    pub http_connstr: String,
 }

 impl PeerInfo {
    fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
        PeerInfo {
            sk_id: NodeId(sk_info.safekeeper_id),
-            _last_log_term: sk_info.last_log_term,
-            _flush_lsn: Lsn(sk_info.flush_lsn),
+            term: sk_info.term,
+            last_log_term: sk_info.last_log_term,
+            flush_lsn: Lsn(sk_info.flush_lsn),
            commit_lsn: Lsn(sk_info.commit_lsn),
            local_start_lsn: Lsn(sk_info.local_start_lsn),
+            pg_connstr: sk_info.safekeeper_connstr.clone(),
+            http_connstr: sk_info.http_connstr.clone(),
            ts,
        }
    }
@@ -262,6 +269,20 @@ impl SharedState {
            availability_zone: conf.availability_zone.clone(),
        }
    }
+
+    /// Get our latest view of alive peers status on the timeline.
+    /// We pass our own info through the broker as well, so when we don't have connection
+    /// to the broker returned vec is empty.
+    fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
+        let now = Instant::now();
+        self.peers_info
+            .0
+            .iter()
+            // Regard peer as absent if we haven't heard from it within heartbeat_timeout.
+            .filter(|p| now.duration_since(p.ts) <= heartbeat_timeout)
+            .cloned()
+            .collect()
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -443,7 +464,9 @@ impl Timeline {
    /// Bootstrap new or existing timeline starting background stasks.
    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
        // Start recovery task which always runs on the timeline.
-        tokio::spawn(recovery_main(self.clone(), conf.clone()));
+        if conf.peer_recovery_enabled {
+            tokio::spawn(recovery_main(self.clone(), conf.clone()));
+        }
    }

    /// Delete timeline from disk completely, by removing timeline directory. Background
@@ -677,20 +700,88 @@ impl Timeline {
        Ok(())
    }

-    /// Get our latest view of alive peers status on the timeline.
-    /// We pass our own info through the broker as well, so when we don't have connection
-    /// to the broker returned vec is empty.
    pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
        let shared_state = self.write_shared_state().await;
-        let now = Instant::now();
-        shared_state
-            .peers_info
-            .0
-            .iter()
-            // Regard peer as absent if we haven't heard from it within heartbeat_timeout.
-            .filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
-            .cloned()
-            .collect()
+        shared_state.get_peers(conf.heartbeat_timeout)
+    }
+
+    /// Should we start fetching WAL from a peer safekeeper, and if yes, from
+    /// which? Answer is yes, i.e. .donors is not empty if 1) there is something
+    /// to fetch, and we can do that without running elections; 2) there is no
+    /// actively streaming compute, as we don't want to compete with it.
+    ///
+    /// If donor(s) are chosen, each donor's term is guaranteed to be equal to
+    /// its last_log_term, so we are sure such a leader had actually been elected.
+    ///
+    /// All possible donors are returned so that we could keep connection to the
+    /// current one if it is good even if it slightly lags behind.
+    ///
+    /// Note that term conditions above might be not met, but safekeepers are
+    /// still not aligned on last flush_lsn. Generally in this case until
+    /// elections are run it is not possible to say which safekeeper should
+    /// recover from which one -- history which would be committed is different
+    /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
+    /// Thus we don't try to predict it here.
+    pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
+        let ss = self.write_shared_state().await;
+        let term = ss.sk.state.acceptor_state.term;
+        let last_log_term = ss.sk.get_epoch();
+        let flush_lsn = ss.sk.flush_lsn();
+        // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
+        let mut peers = ss.get_peers(heartbeat_timeout);
+        // Sort by <last log term, lsn> pairs.
+        peers.sort_by(|p1, p2| {
+            let tl1 = TermLsn {
+                term: p1.last_log_term,
+                lsn: p1.flush_lsn,
+            };
+            let tl2 = TermLsn {
+                term: p2.last_log_term,
+                lsn: p2.flush_lsn,
+            };
+            tl2.cmp(&tl1) // desc
+        });
+        let num_streaming_computes = self.walreceivers.get_num_streaming();
+        let donors = if num_streaming_computes > 0 {
+            vec![] // If there is a streaming compute, don't try to recover to not intervene.
+        } else {
+            peers
+                .iter()
+                .filter_map(|candidate| {
+                    // Are we interested in this candidate?
+                    let candidate_tl = TermLsn {
+                        term: candidate.last_log_term,
+                        lsn: candidate.flush_lsn,
+                    };
+                    let my_tl = TermLsn {
+                        term: last_log_term,
+                        lsn: flush_lsn,
+                    };
+                    if my_tl < candidate_tl {
+                        // Yes, we are interested. Can we pull from it without
+                        // (re)running elections? It is possible if 1) his term
+                        // is equal to his last_log_term so we could act on
+                        // behalf of leader of this term (we must be sure he was
+                        // ever elected) and 2) our term is not higher, or we'll refuse data.
+                        if candidate.term == candidate.last_log_term && candidate.term >= term {
+                            Some(Donor::from(candidate))
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    }
+                })
+                .collect()
+        };
+        RecoveryNeededInfo {
+            term,
+            last_log_term,
+            flush_lsn,
+            peers,
+            num_streaming_computes,
+            donors,
+        }
    }

    pub fn get_walsenders(&self) -> &Arc<WalSenders> {
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -60,7 +60,6 @@ from fixtures.utils import (
    allure_attach_from_dir,
    get_self_dir,
    subprocess_capture,
-    wait_until,
 )

 """
@@ -1632,7 +1631,7 @@ class NeonPageserver(PgProtocol):
            ".*took more than expected to complete.*",
            # these can happen during shutdown, but it should not be a reason to fail a test
            ".*completed, took longer than expected.*",
-            '.*registered custom resource manager "neon".*',
+            '.*registered custom resource manager \\\\"neon\\\\".*',
            # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
            # and it is not a failure of our code when it happens.
            ".*DeleteObjects.*We encountered an internal error. Please try again.*",
@@ -1681,41 +1680,6 @@ class NeonPageserver(PgProtocol):
            self.running = False
        return self

-    def restart(self, immediate: bool = False):
-        """
-        High level wrapper for restart: restarts the process, and waits for
-        tenant state to stabilize.
-        """
-        self.stop(immediate=immediate)
-        self.start()
-        self.quiesce_tenants()
-
-    def quiesce_tenants(self):
-        """
-        Wait for all tenants to enter a stable state (Active or Broken)
-
-        Call this after restarting the pageserver, or after attaching a tenant,
-        to ensure that it is ready for use.
-        """
-
-        stable_states = {"Active", "Broken"}
-
-        client = self.http_client()
-
-        def complete():
-            log.info("Checking tenants...")
-            tenants = client.tenant_list()
-            tenants = client.tenant_list()
-            log.info(f"Tenant list: {tenants}...")
-            any_unstable = any((t["state"]["slug"] not in stable_states) for t in tenants)
-            if any_unstable:
-                for t in tenants:
-                    log.info(f"Waiting for tenant {t['id']} in state {t['state']['slug']}")
-            log.info(f"any_unstable={any_unstable}")
-            assert not any_unstable
-
-        wait_until(20, 0.5, complete)
-
    def __enter__(self) -> "NeonPageserver":
        return self

@@ -2794,6 +2758,20 @@ class Safekeeper:
    def data_dir(self) -> str:
        return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")

+    def timeline_dir(self, tenant_id, timeline_id) -> str:
+        return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
+
+    def list_segments(self, tenant_id, timeline_id) -> List[str]:
+        """
+        Get list of segment names of the given timeline.
+        """
+        tli_dir = self.timeline_dir(tenant_id, timeline_id)
+        segments = []
+        for _, _, filenames in os.walk(tli_dir):
+            segments.extend([f for f in filenames if f != "safekeeper.control"])
+        segments.sort()
+        return segments
+

@dataclass
 class SafekeeperTimelineStatus:
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -157,7 +157,7 @@ def wait_for_last_record_lsn(
    lsn: Lsn,
 ) -> Lsn:
    """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
-    for i in range(100):
+    for i in range(1000000):
        current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
        if current_lsn >= lsn:
            return current_lsn
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,5 +1,8 @@
+import pytest
+import os
 import shutil
 from contextlib import closing
+from fixtures.log_helper import log

 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pg_version import PgVersion
@@ -18,6 +21,9 @@ from fixtures.pg_version import PgVersion
 def test_bulk_insert(neon_with_baseline: PgCompare):
    env = neon_with_baseline

+    # Number of times to run the write query. One run creates 350MB of wal.
+    n_writes = 10
+
    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table huge (i int, j int);")
@@ -25,7 +31,10 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
            # Run INSERT, recording the time and I/O it takes
            with env.record_pageserver_writes("pageserver_writes"):
                with env.record_duration("insert"):
-                    cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
+                    for i in range(n_writes):
+                        if n_writes > 1:
+                            log.info(f"running query {i}/{n_writes}")
+                        cur.execute("insert into huge values (generate_series(1, 5000000), 0);")
                    env.flush()

            env.report_peak_memory_use()
@@ -39,7 +48,9 @@ def test_bulk_insert(neon_with_baseline: PgCompare):


 def measure_recovery_time(env: NeonCompare):
-    client = env.env.pageserver.http_client()
+    # Hmm why is pageserver less ready to respond to http when the datadir is large?
+    from urllib3.util.retry import Retry
+    client = env.env.pageserver.http_client(retries=Retry(1000))
    pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])

    # Stop pageserver and remove tenant data
@@ -57,3 +68,13 @@ def measure_recovery_time(env: NeonCompare):

        # Flush, which will also wait for lsn to catch up
        env.flush()
+
+
+# This test is meant for local iteration only. The use case is when you want to re-run
+# the measure_recovery_time part of test_bulk_insert, but without running the setup.
+# It allows you to iterate on results 2x faster while trying to improve wal ingestion
+# performance.
+@pytest.mark.skip("this is a convenience test for local dev only")
+def test_recovery(neon_env_builder):
+    env = neon_env_builder.init_start()
+    measure_recovery_time(env)
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml
@@ -9,8 +9,8 @@ publish = false
 [dependencies]
 native-tls = "0.2.11"
 postgres-native-tls = "0.5.0"
-tokio = { version = "1.28", features=["rt", "macros"] }
-tokio-postgres = "0.7.8"
+tokio = { version = "1.33", features=["rt", "macros"] }
+tokio-postgres = "0.7.10"


 # This is not part of the main 'neon' workspace
--- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
+++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile
@@ -1,4 +1,4 @@
-FROM rust:1.70
+FROM rust:1.73
 WORKDIR /source

 COPY . .
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -333,30 +333,16 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
    env = neon_env_builder.init_configs()
    env.start()

-    env.pageserver.allowed_errors.extend(
-        [
-            ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
-            ".*Failed to load index_part from remote storage.*",
-            # On a fast restart, there may be an initdb still running in a basebackup...__temp directory
-            ".*Failed to purge.*Directory not empty.*",
-        ]
+    env.pageserver.allowed_errors.append(
+        ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
    )
    ps_http = env.pageserver.http_client()

    # pause all uploads
+    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
    ps_http.tenant_create(env.initial_tenant)

-    # Create a timeline whose creation will succeed.  The tenant will need at least one
-    # timeline to be loadable.
-    success_timeline = TimelineId.generate()
-    log.info(f"Creating timeline {success_timeline}")
-    ps_http.timeline_create(env.pg_version, env.initial_tenant, success_timeline, timeout=60)
-
-    # Create a timeline whose upload to remote storage will be blocked
-    ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
-
    def start_creating_timeline():
-        log.info(f"Creating (expect failure) timeline {env.initial_timeline}")
        with pytest.raises(RequestException):
            ps_http.timeline_create(
                env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
@@ -380,9 +366,6 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
    with pytest.raises(PageserverApiException, match="not found"):
        ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)

-    # The one successfully created timeline should still be there.
-    assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
-

 def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
    """
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -15,7 +15,7 @@ from fixtures.types import TenantId, TimelineId

 # Test restarting page server, while safekeeper and compute node keep
 # running.
-def test_local_corruption(neon_env_builder: NeonEnvBuilder):
+def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(
@@ -69,19 +69,24 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):

    env.pageserver.start()

-    # Un-damaged tenant works
+    # Tenant 0 should still work
    pg0.start()
    assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100

-    # Tenant with corrupt local metadata works: remote storage is authoritative for metadata
-    pg1.start()
-    assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
+    # But all others are broken
+
+    # First timeline would not get loaded into pageserver due to corrupt metadata file
+    with pytest.raises(
+        Exception, match=f"Tenant {tenant1} will not become active. Current state: Broken"
+    ) as err:
+        pg1.start()
+    log.info(
+        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
+    )

    # Second timeline will fail during basebackup, because the local layer file is corrupt.
    # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
    # (We don't check layer file contents on startup, when loading the timeline)
-    #
-    # This will change when we implement checksums for layers
    with pytest.raises(Exception, match="Failed to load delta layer") as err:
        pg2.start()
    log.info(
@@ -128,7 +133,8 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
        _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id)

    # Restart the page server
-    env.pageserver.restart(immediate=True)
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    new_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -157,6 +157,8 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
        tenant_id, timeline_id = env.neon_cli.create_tenant()
        endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)

+        # insert something to force sk -> ps message
+        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
        # Wait to make sure that we get a latest WAL receiver data.
        # We need to wait here because it's possible that we don't have access to
        # the latest WAL yet, when the `timeline_detail` API is first called.
@@ -168,7 +170,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
        )

        # Make a DB modification then expect getting a new WAL receiver's data.
-        endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
+        endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
        wait_until(
            number_of_iterations=5,
            interval=1,
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -62,14 +62,14 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    tenant_load_delay_ms = 5000
    env.pageserver.stop()
    env.pageserver.start(
-        extra_env_vars={"FAILPOINTS": f"before-attaching-tenant=return({tenant_load_delay_ms})"}
+        extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
    )

-    # Check that it's in Attaching state
+    # Check that it's in Loading state
    client = env.pageserver.http_client()
    tenant_status = client.tenant_status(env.initial_tenant)
    log.info("Tenant status : %s", tenant_status)
-    assert tenant_status["state"]["slug"] == "Attaching"
+    assert tenant_status["state"]["slug"] == "Loading"

    # Try to read. This waits until the loading finishes, and then return normally.
    cur.execute("SELECT count(*) FROM foo")
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -241,7 +241,8 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
        assert reason.endswith(f"failpoint: {failpoint}"), reason

    if check is Check.RETRY_WITH_RESTART:
-        env.pageserver.restart()
+        env.pageserver.stop()
+        env.pageserver.start()

        if failpoint in (
            "tenant-delete-before-shutdown",
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -66,6 +66,10 @@ def test_tenant_reattach(
    env.pageserver.allowed_errors.append(
        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
    )
+    # That's because of UnreliableWrapper's injected failures
+    env.pageserver.allowed_errors.append(
+        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
+    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
@@ -112,7 +116,7 @@ def test_tenant_reattach(
            assert query_scalar(cur, "SELECT count(*) FROM t") == 100000

        # Check that we had to retry the downloads
-        assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*")
+        assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
        assert env.pageserver.log_contains(".*download.*failed, will retry.*")


@@ -639,6 +643,47 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
    ensure_test_data(data_id, data_secret, endpoint)


+# Tests that it's possible to `load` broken tenants:
+# * `ignore` a tenant
+# * remove its `metadata` file locally
+# * `load` the same tenant
+# * ensure that its status is `Broken`
+def test_ignored_tenant_stays_broken_without_metadata(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+    env.endpoints.create_start("main")
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: (Broken|Stopping).*"
+    )
+
+    # ignore the tenant and remove its metadata
+    pageserver_http.tenant_ignore(tenant_id)
+    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
+    metadata_removed = False
+    for dir_entry in timeline_dir.iterdir():
+        if dir_entry.name == "metadata":
+            # Found the timeline's metadata file. Remove it
+            dir_entry.unlink()
+            metadata_removed = True
+    assert metadata_removed, f"Failed to find metadata file in {timeline_dir}"
+
+    env.pageserver.allowed_errors.append(
+        f".*{tenant_id}.*: load failed.*: failed to load metadata.*"
+    )
+
+    # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
+    pageserver_http.tenant_load(tenant_id=tenant_id)
+    wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5)
+
+
 # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
 # Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored.
 def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
@@ -733,8 +778,7 @@ def test_ignore_while_attaching(
        tenants_before_ignore
    ), "Only ignored tenant should be missing"

-    # Calling load will bring the tenant back online
-    pageserver_http.configure_failpoints([("attach-before-activate", "off")])
+    # But we can load it from local files; that will restore the attach.
    pageserver_http.tenant_load(tenant_id)

    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import time
 from contextlib import closing
 from datetime import datetime
@@ -19,7 +20,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import timeline_delete_wait_completed
 from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
-from fixtures.types import Lsn, TenantId
+from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample

@@ -297,8 +298,13 @@ def test_pageserver_with_empty_tenants(

    client = env.pageserver.http_client()

-    tenant_with_empty_timelines = env.initial_tenant
-    timeline_delete_wait_completed(client, tenant_with_empty_timelines, env.initial_timeline)
+    tenant_with_empty_timelines = TenantId.generate()
+    client.tenant_create(tenant_with_empty_timelines)
+    temp_timelines = client.timeline_list(tenant_with_empty_timelines)
+    for temp_timeline in temp_timelines:
+        timeline_delete_wait_completed(
+            client, tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"])
+        )

    files_in_timelines_dir = sum(
        1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines))
@@ -311,19 +317,34 @@ def test_pageserver_with_empty_tenants(
    env.endpoints.stop_all()
    env.pageserver.stop()

+    tenant_without_timelines_dir = env.initial_tenant
+    shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir))
+
    env.pageserver.start()

    client = env.pageserver.http_client()

-    def not_attaching():
+    def not_loading():
        tenants = client.tenant_list()
-        assert len(tenants) == 1
-        assert all(t["state"]["slug"] != "Attaching" for t in tenants)
+        assert len(tenants) == 2
+        assert all(t["state"]["slug"] != "Loading" for t in tenants)

-    wait_until(10, 0.2, not_attaching)
+    wait_until(10, 0.2, not_loading)

    tenants = client.tenant_list()

+    [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)]
+    assert (
+        broken_tenant["state"]["slug"] == "Broken"
+    ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
+
+    broken_tenant_status = client.tenant_status(tenant_without_timelines_dir)
+    assert (
+        broken_tenant_status["state"]["slug"] == "Broken"
+    ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken"
+
+    assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*")
+
    [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)]
    assert (
        loaded_tenant["state"]["slug"] == "Active"
@@ -337,6 +358,9 @@ def test_pageserver_with_empty_tenants(
    time.sleep(1)  # to allow metrics propagation

    ps_metrics = client.get_metrics()
+    broken_tenants_metric_filter = {
+        "tenant_id": str(tenant_without_timelines_dir),
+    }
    active_tenants_metric_filter = {
        "state": "Active",
    }
@@ -350,3 +374,13 @@ def test_pageserver_with_empty_tenants(
    assert (
        tenant_active_count == 1
    ), f"Tenant {tenant_with_empty_timelines} should have metric as active"
+
+    tenant_broken_count = int(
+        ps_metrics.query_one(
+            "pageserver_broken_tenants_count", filter=broken_tenants_metric_filter
+        ).value
+    )
+
+    assert (
+        tenant_broken_count == 1
+    ), f"Tenant {tenant_without_timelines_dir} should have metric as broken"
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -70,7 +70,8 @@ def test_threshold_based_eviction(
    }

    # restart because changing tenant config is not instant
-    env.pageserver.restart()
+    env.pageserver.stop()
+    env.pageserver.start()

    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
        "kind": "LayerAccessThreshold",
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -277,6 +277,13 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

            if failpoint == "timeline-delete-after-index-delete":
                m = ps_http.get_metrics()
+                assert (
+                    m.query_one(
+                        "remote_storage_s3_request_seconds_count",
+                        filter={"request_type": "get_object", "result": "err"},
+                    ).value
+                    == 2  # One is missing tenant deletion mark, second is missing index part
+                )
                assert (
                    m.query_one(
                        "remote_storage_s3_request_seconds_count",
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1,3 +1,4 @@
+import filecmp
 import os
 import pathlib
 import random
@@ -400,8 +401,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 def wait(f, desc, timeout=30, wait_f=None):
    started_at = time.time()
    while True:
-        if f():
-            break
+        try:
+            if f():
+                break
+        except Exception:
+            pass
        elapsed = time.time() - started_at
        if elapsed > timeout:
            raise RuntimeError(f"timed out waiting {elapsed:.0f}s for {desc}")
@@ -984,6 +988,141 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
        endpoint.start()


+# Test that we can create timeline with one safekeeper down and initialize it
+# later when some data already had been written.
+def test_late_init(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    sk1 = env.safekeepers[0]
+    sk1.stop()
+
+    # create and insert something while safekeeper is down...
+    env.neon_cli.create_branch("test_late_init")
+    endpoint = env.endpoints.create_start("test_late_init")
+    endpoint.safe_psql("create table t(key int, value text)")
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
+    log.info("insert with safekeeper down done")
+    endpoint.stop()  # stop compute
+
+    # stop another safekeeper, and start one which missed timeline creation
+    sk2 = env.safekeepers[1]
+    sk2.stop()
+    sk1.start()
+
+    # insert some more
+    endpoint = env.endpoints.create_start("test_late_init")
+    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
+
+
+# is timeline flush_lsn equal on provided safekeepers?
+def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
+    status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
+    status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(
+        f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
+    )
+    return status1.flush_lsn == status2.flush_lsn
+
+
+# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
+# 1) walproposer can't recover node if it misses WAL written by previous computes, but
+#    still starts up and functions normally if two other sks are ok.
+# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
+#    normally if two other sks are ok.
+# 3) Lagged safekeeper can still recover by peer recovery.
+def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
+    pass
+
+
+# Smaller version of test_one_sk_down testing peer recovery in isolation: that
+# it works without compute at all.
+def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_peer_recovery")
+    endpoint = env.endpoints.create_start("test_peer_recovery")
+
+    endpoint.safe_psql("create table t(key int, value text)")
+    sk1 = env.safekeepers[0]
+    sk2 = env.safekeepers[1]
+    sk1_http_cli = sk1.http_client()
+    sk2_http_cli = sk2.http_client()
+    # ensure tli gets created on sk1, peer recovery won't do that
+    wait(
+        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        "flush_lsn to get aligned",
+    )
+
+    sk1 = env.safekeepers[0]
+    sk1.stop()
+
+    # roughly fills one segment
+    endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'")
+
+    endpoint.stop()  # stop compute
+
+    # now start safekeeper, but with peer recovery disabled; it should lag for about a segment
+    sk1.start(extra_opts=["--peer-recovery=false"])
+    sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
+    sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(
+        f"flush_lsns after insertion: sk1={sk1_tli_status.flush_lsn}, sk2={sk2_tli_status.flush_lsn}"
+    )
+    assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
+
+    # wait a bit, lsns shouldn't change
+    # time.sleep(5)
+    sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
+    sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(
+        f"flush_lsns after waiting: sk1={sk1_tli_status.flush_lsn}, sk2={sk2_tli_status.flush_lsn}"
+    )
+    assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
+
+    # now restart safekeeper with peer recovery enabled and wait for recovery
+    sk1.stop().start(extra_opts=["--peer-recovery=true"])
+    wait(
+        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        "flush_lsn to get aligned",
+    )
+
+    # check that WALs are identical after recovery
+    segs = sk1.list_segments(tenant_id, timeline_id)
+    log.info(f"segs are {segs}")
+
+    (_, mismatch, not_regular) = filecmp.cmpfiles(
+        sk1.timeline_dir(tenant_id, timeline_id),
+        sk2.timeline_dir(tenant_id, timeline_id),
+        segs,
+        shallow=False,
+    )
+    log.info(
+        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+    )
+
+    for f in mismatch:
+        f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
+        f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
+        stdout_filename = "{}.filediff".format(f2)
+
+        with open(stdout_filename, "w") as stdout_f:
+            subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+            subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+            cmd = "diff {}.hex {}.hex".format(f1, f2)
+            subprocess.run([cmd], stdout=stdout_f, shell=True)
+
+    assert (mismatch, not_regular) == ([], [])
+
+    # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
+    env.safekeepers[2].stop()
+    endpoint = env.endpoints.create_start("test_peer_recovery")
+    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
+
+
 class SafekeeperEnv:
    def __init__(
        self,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "e5e255d2da05bc5f884b871c042014030a114a9b",
-    "postgres-v15": "23f2d411020a739375b32895ce1362ded2962084",
-    "postgres-v14": "ebcca9e9eb49621b5b17247833b59e836337e8aa"
+    "postgres-v16": "550ffa6495a5dc62fccc3a8b449386633758680b",
+    "postgres-v15": "ab67ab96355d61e9d0218630be4aa7db53bf83e7",
+    "postgres-v14": "6669a672ee14ab2c09d44c4552f9a13fad3afc10"
 }
Author	SHA1	Message	Date
Bojan Serafimov	a8fd6266aa	wip	2023-10-27 11:42:18 -04:00
Bojan Serafimov	151605d751	wip	2023-10-24 13:11:40 -04:00
John Spray	eaaa18f6ed	attachment_service: graceful SIGQUIT (#5626 ) `attachment_service` doesn't explicitly handle signals, which causes a backtrace when `neon_local` kills it with SIGQUIT. Closes: https://github.com/neondatabase/neon/issues/5613	2023-10-23 17:30:25 +01:00
John Spray	188f67e1df	pageserver: forward compat: be tolerant of deletion marker in `timelines/` (#5632 ) ## Problem https://github.com/neondatabase/neon/pull/5580 will move the remote deletion marker into the `timelines/` path. This would cause old pageserver code to fail loading the tenant due to an apparently invalid timeline ID. That would be a problem if we had to roll back after deploying #5580 ## Summary of changes If a `deleted` file is in `timelines/` just ignore it.	2023-10-23 17:51:38 +02:00
John Spray	7e805200bb	pageserver: parallel load of configs (#5607 ) ## Problem When the number of tenants is large, sequentially issuing the open/read calls for their config files is a ~1000ms delay during startup. It's not a lot, but it's simple to fix. ## Summary of changes Put all the config loads into spawn_blocking() tasks and run them in a JoinSet. We can simplify this a bit later when we have full async disk I/O. --------- Co-authored-by: Shany Pozin <shany@neon.tech>	2023-10-23 15:32:34 +01:00
Christian Schwarz	c6ca1d76d2	consumption_metrics: fix periodicness behavior & reporting (#5625 ) Before this PR, the ticker was running at default miss behavior `Delay`. For example, here is the startup output with 25k tenants: ``` 2023-10-19T09:57:21.682466Z INFO synthetic_size_worker: starting calculate_synthetic_size_worker 2023-10-19T10:50:44.678202Z WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=3202.995707156s period=10m task=ConsumptionMetricsSyntheticSizeWorker 2023-10-19T10:52:17.408056Z WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=2695.72556035s period=10m task=ConsumptionMetricsSyntheticSizeWorker ``` The first message's `elapsed` value is correct. It matches the delta between the log line timestamps. The second one is logged ca 1.5min after, though, but reports a much larger `elapsed` than 1.5min. This PR fixes the behavior by copying what `eviction_task.rs` does.	2023-10-23 16:31:38 +02:00
Conrad Ludgate	94b4e76e13	proxy: latency connect outcome (#5588 ) ## Problem I recently updated the latency timers to include cache miss and pool miss, as well as connection protocol. By moving the latency timer to start before authentication, we count a lot more failures and it's messed up the latency dashboard. ## Summary of changes Add another label to LatencyTimer metrics for outcome. Explicitly report on success	2023-10-23 15:17:28 +01:00
khanova	b514da90cb	Set up timeout for scram protocol execution (#5551 ) ## Problem Context: https://github.com/neondatabase/neon/issues/5511#issuecomment-1759649679 Some of our scram protocol executions timed out only after 17 minutes. ## Summary of changes Make timeout for scram execution meaningful and configurable.	2023-10-23 15:11:05 +01:00
Conrad Ludgate	7d17f1719f	reduce cancel map contention (#5555 ) ## Problem Every database request locks this cancel map rwlock. At high requests per second this would have high contention ## Summary of changes Switch to dashmap which has a sharded rwlock to reduce contention	2023-10-23 14:12:41 +01:00
John Spray	41ee75bc71	pageserver: do config writes in a spawn_blocking (#5603 ) ## Problem We now persist tenant configuration every time we spawn a tenant. The persist_tenant_config function is doing a series of non-async filesystem I/O, because `crashsafe::` isn't async yet. This isn't a demonstrated problem, but is a source of uncertainty when reasoning about what's happening with our startup times. ## Summary of changes - Wrap `crashsafe_overwrite` in `spawn_blocking`. - Although I think this change makes sense, it does not have a measurable impact on load time when testing with 10k tenants. - This can be reverted when we have full async I/O	2023-10-23 09:19:01 +01:00
Christian Schwarz	11e523f503	walredo: fix EAGAIN/"os error 11" false page reconstruction failures (#5560 ) Stacked atop https://github.com/neondatabase/neon/pull/5559 Before this PR, there was the following race condition: ``` T1: polls for writeable stdin T1: writes to stdin T1: enters poll for stdout/stderr T2: enters poll for stdin write WALREDO: writes to stderr KERNEL: wakes up T1 and T2 Tx: reads stderr and prints it Ty: reads stderr and gets EAGAIN (valid values for (x, y) are (1, 2) or (2, 1)) ``` The concrete symptom that we observed repeatedly was with PG16, which started logging `registered custom resource manager` to stderr always, during startup, thereby giving us repeated opportunity to hit above race condition. PG14 and PG15 didn't log anything to stderr, hence we could have only hit this race condition if there was an actual error happening. This PR fixes the race by moving the reading of stderr into a tokio task. It exits when the stderr is closed by the child process, which in turn happens when the child exits, either by itself or because we killed it. The downside is that the async scheduling can reorder the log messages, which can be seen in the new `test_stderr`, which runs in a single-threaded runtime. I included the output below. Overall I think we should move the entire walredo to async, as Joonas proposed many months ago. This PR's asyncification is just the first step to resolve these false page reconstruction errors. After this is fixed, we should stop printing that annoying stderr message on walredo startup; it causes noise in the pageserver logs. That work is tracked in #5399 . 
``` 2023-10-13T19:05:21.878858Z ERROR apply_wal_records{tenant_id=d546fb76ba529195392fb4d19e243991 pid=753986}: failed to write out the walredo errored input: No such file or directory (os error 2) target=walredo-1697223921878-1132-0.walredo length=1132 2023-10-13T19:05:21.878932Z DEBUG postgres applied 2 WAL records (1062 bytes) in 114666 us to reconstruct page image at LSN 0/0 2023-10-13T19:05:21.878942Z ERROR error applying 2 WAL records 0/16A9388..0/16D4080 (1062 bytes) to base image with LSN 0/0 to reconstruct page image at LSN 0/0 n_attempts=0: apply_wal_records Caused by: WAL redo process closed its stdout unexpectedly 2023-10-13T19:05:21.879027Z INFO kill_and_wait_impl{pid=753986}: wait successful exit_status=signal: 11 (SIGSEGV) (core dumped) 2023-10-13T19:05:21.879079Z DEBUG wal-redo-postgres-stderr{pid=753986 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: wal-redo-postgres stderr_logger_task started 2023-10-13T19:05:21.879104Z ERROR wal-redo-postgres-stderr{pid=753986 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: received output output="2023-10-13 19:05:21.769 GMT [753986] LOG: registered custom resource manager \"neon\" with ID 134\n" 2023-10-13T19:05:21.879116Z DEBUG wal-redo-postgres-stderr{pid=753986 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: wal-redo-postgres stderr_logger_task finished 2023-10-13T19:05:22.004439Z ERROR apply_wal_records{tenant_id=d546fb76ba529195392fb4d19e243991 pid=754000}: failed to write out the walredo errored input: No such file or directory (os error 2) target=walredo-1697223922004-1132-0.walredo length=1132 2023-10-13T19:05:22.004493Z DEBUG postgres applied 2 WAL records (1062 bytes) in 125344 us to reconstruct page image at LSN 0/0 2023-10-13T19:05:22.004501Z ERROR error applying 2 WAL records 0/16A9388..0/16D4080 (1062 bytes) to base image with LSN 0/0 to reconstruct page image at LSN 0/0 n_attempts=1: apply_wal_records Caused by: WAL redo process closed its stdout unexpectedly 
2023-10-13T19:05:22.004588Z INFO kill_and_wait_impl{pid=754000}: wait successful exit_status=signal: 11 (SIGSEGV) (core dumped) 2023-10-13T19:05:22.004624Z DEBUG wal-redo-postgres-stderr{pid=754000 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: wal-redo-postgres stderr_logger_task started 2023-10-13T19:05:22.004653Z ERROR wal-redo-postgres-stderr{pid=754000 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: received output output="2023-10-13 19:05:21.884 GMT [754000] LOG: registered custom resource manager \"neon\" with ID 134\n" 2023-10-13T19:05:22.004666Z DEBUG wal-redo-postgres-stderr{pid=754000 tenant_id=d546fb76ba529195392fb4d19e243991 pg_version=16}: wal-redo-postgres stderr_logger_task finished ```	2023-10-23 09:00:13 +01:00
Konstantin Knizhnik	b1a1126152	Grant replication permission to newly created users (#5615 ) ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-10-20 21:29:17 +03:00
John Spray	a8899e1e0f	pageserver: apply timeout when waiting for tenant loads (#5601 ) ## Problem Loading tenants shouldn't hang. However, if it does, we shouldn't let one hung tenant prevent the entire process from starting background jobs. ## Summary of changes Generalize the timeout mechanism that we already applied to loading initial logical sizes: each phase in startup where we wait for a barrier is subject to a timeout, and startup will proceed if it doesn't complete within timeout. Startup metrics will still reflect the time when a phase actually completed, rather than when we skipped it. The code isn't the most beautiful, but that kind of reflects the awkwardness of await'ing on a future and then stashing it to await again later if we time out. I could imagine making this cleaner in future by waiting on a structure that doesn't self-destruct on wait() the way Barrier does, then make InitializationOrder into a structure that manages the series of waits etc.	2023-10-20 09:15:34 +01:00
Arseny Sher	2fbd5ab075	Add safekeeper test_late_init.	2023-10-20 10:57:59 +03:00
Arseny Sher	702382e99a	Add check that WAL segments are identical after recovery.	2023-10-20 10:57:59 +03:00
Arseny Sher	1b53b3e200	Make test_pageserver_http_get_wal_receiver_success not wait for keepalive.	2023-10-20 10:57:59 +03:00
Arseny Sher	b332268cec	Introduce safekeeper peer recovery. Implements fetching of WAL by safekeeper from another safekeeper by imitating behaviour of last elected leader. This avoids WAL accumulation on compute and facilitates faster compute startup as it doesn't need to download any WAL. Actually removing WAL download in walproposer is a matter of another patch though. There is a per timeline task which always runs, checking regularly if it should start recovery from someone, meaning there is something to fetch and there is no streaming compute. It then proceeds with fetching, finishing when there is nothing more to receive. Implements https://github.com/neondatabase/neon/pull/4875	2023-10-20 10:57:59 +03:00
Arseny Sher	76c702219c	Don't use AppenRequestHeader.epoch_start_lsn. It is simpler to get it once from ProposerEelected.	2023-10-20 10:57:59 +03:00
Arthur Petukhovsky	ba856140e7	Fix neon_extra_builds.yml (#5605 ) Build walproposer-lib in gather-rust-build-stats, fix nproc usage, fix walproposer-lib on macos.	2023-10-19 22:20:39 +01:00
Em Sharnoff	2cf6a47cca	vm-monitor: Deny not fail downscale if no memory stats yet (#5606 ) Fixes an issue we observed on staging that happens when the autoscaler-agent attempts to immediately downscale the VM after binding, which is typical for pooled computes. The issue was occurring because the autoscaler-agent was requesting downscaling before the vm-monitor had gathered sufficient cgroup memory stats to be confident in approving it. When the vm-monitor returned an internal error instead of denying downscaling, the autoscaler-agent retried the connection and immediately hit the same issue (in part because cgroup stats are collected per-connection, rather than globally).	2023-10-19 19:09:37 +01:00
Konstantin Knizhnik	5a8bcdccb0	Fix elog format error in wallog_mapping_file (#5602 ) ## Problem Fix elog format error in wallog_mapping_file ## Summary of changes Use proper case to avoid compilation warning=error in C at MacOS. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-10-19 20:24:35 +03:00
Em Sharnoff	2c8741a5ed	vm-monitor: Log full error on message handling failure (#5604 ) There's currently an issue with the vm-monitor on staging that's not really feasible to debug because the current display impl gives no context to the errors (just says "failed to downscale"). Logging the full error should help. For communications with the autoscaler-agent, it's ok to only provide the outermost cause, because we can cross-reference with the VM logs. At some point in the future, we may want to change that.	2023-10-19 18:10:33 +02:00
Shany Pozin	893b7bac9a	Fix neon_extra_builds.yml : nproc is not supported in mac os (#5598 ) ## Problem nproc is not supported in mac os, use sysctl -n hw.ncpu instead	2023-10-19 15:24:23 +01:00
Arthur Petukhovsky	66f8f5f1c8	Call walproposer from Rust (#5403 ) Create Rust bindings for C functions from walproposer. This allows to write better tests with real walproposer code without spawning multiple processes and starting up the whole environment. `make walproposer-lib` stage was added to build static libraries `libwalproposer.a`, `libpgport.a`, `libpgcommon.a`. These libraries can be statically linked to any executable to call walproposer functions. `libs/walproposer/src/walproposer.rs` contains `test_simple_sync_safekeepers` to test that walproposer can be called from Rust to emulate sync_safekeepers logic. It can also be used as a usage example.	2023-10-19 14:17:15 +01:00
Alexander Bayandin	3a19da1066	build(deps): bump rustix from 0.37.19 to 0.37.25 (#5596 ) ## Problem @dependabot has bumped `rustix` 0.36 version to the latest in https://github.com/neondatabase/neon/pull/5591, but didn't bump 0.37. Also, update all Rust dependencies for `test_runner/pg_clients/rust/tokio-postgres`. Fixes - https://github.com/neondatabase/neon/security/dependabot/39 - https://github.com/neondatabase/neon/security/dependabot/40 ## Summary of changes - `cargo update -p rustix@0.37.19` - Update all dependencies for `test_runner/pg_clients/rust/tokio-postgres`	2023-10-19 13:49:06 +01:00
Conrad Ludgate	572eda44ee	update tokio-postgres (#5597 ) https://github.com/neondatabase/rust-postgres/pull/23	2023-10-19 14:32:19 +02:00
Arpad Müller	b1d6af5ebe	Azure blobs: Simplify error conversion by addition of to_download_error (#5575 ) There is a bunch of duplication and manual Result handling that can be simplified by moving the error conversion into a shared function, using `map_err`, and the question mark operator.	2023-10-19 14:31:09 +02:00