Merge pull request #5638 from neondatabase/releases/2023-10-24

Release 2023-10-24
Cleanup in azure_upload_download_works test (#5636)
2026-06-19 21:30:38 +00:00 · 2023-10-24 12:10:52 +03:00 · 2023-10-23 19:08:56 +01:00 · 2023-10-23 17:30:25 +01:00 · 2023-10-23 17:51:38 +02:00 · 2023-10-23 15:32:34 +01:00
35 changed files with 1170 additions and 2517 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,3 @@ test_output/
 *.o
 *.so
 *.Po
-
-# pgindent typedef lists
-*.list
--- a/Makefile
+++ b/Makefile
@@ -256,44 +256,6 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

-postgres-%-pg-bsd-indent: postgres-%
-	+@echo "Compiling pg_bsd_indent"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
-
-# Create typedef list for the core. Note that generally it should be combined with
-# buildfarm one to cover platform specific stuff.
-# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
-postgres-%-typedefs.list: postgres-%
-	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
-
-# Indent postgres. See src/tools/pgindent/README for details.
-.PHONY: postgres-%-pgindent
-postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
-	+@echo merge with buildfarm typedef to cover all platforms
-	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
-		REL_16_STABLE list misses PGSemaphoreData
-	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
-	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
-	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
-		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
-	+@echo note: you might want to run it on selected files/dirs instead.
-	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
-		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
-		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
-		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
-	rm -f pg*.BAK
-
-# Indent pxgn/neon.
-.PHONY: pgindent
-neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
-		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
-
-
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -268,7 +269,16 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
+
+    tokio::task::spawn(server);
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;

    Ok(())
 }
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -267,6 +267,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    let buf = download_and_compare(dl).await?;
    assert_eq!(buf, data);

+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
    Ok(())
 }

--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -11,6 +11,7 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -88,22 +89,12 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
    loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        if metrics.is_empty() {
-            continue;
-        }
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -142,10 +133,19 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            metric_collection_interval,
            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }

@@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            synthetic_size_calculation_interval,
            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,6 +18,7 @@ use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use storage_broker::BrokerClientChannel;
+use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -2614,6 +2615,7 @@ impl Tenant {
    ) -> anyhow::Result<()> {
        let legacy_config_path = conf.tenant_config_path(tenant_id);
        let config_path = conf.tenant_location_config_path(tenant_id);
+
        Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf)
            .await
    }
@@ -2652,12 +2654,20 @@ impl Tenant {
        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;

-        let conf_content = conf_content.as_bytes();
-
        let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
-        VirtualFile::crashsafe_overwrite(config_path, &temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_id} config to {config_path}"))?;
+
+        let tenant_id = *tenant_id;
+        let config_path = config_path.to_owned();
+        tokio::task::spawn_blocking(move || {
+            Handle::current().block_on(async move {
+                let conf_content = conf_content.as_bytes();
+                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
+                    .await
+                    .with_context(|| format!("write tenant {tenant_id} config to {config_path}"))
+            })
+        })
+        .await??;
+
        Ok(())
    }

@@ -2679,12 +2689,21 @@ impl Tenant {
        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

-        let conf_content = conf_content.as_bytes();
-
        let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
-        VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
-            .await
-            .with_context(|| format!("write tenant {tenant_id} config to {target_config_path}"))?;
+
+        let tenant_id = *tenant_id;
+        let target_config_path = target_config_path.to_owned();
+        tokio::task::spawn_blocking(move || {
+            Handle::current().block_on(async move {
+                let conf_content = conf_content.as_bytes();
+                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
+                    .await
+                    .with_context(|| {
+                        format!("write tenant {tenant_id} config to {target_config_path}")
+                    })
+            })
+        })
+        .await??;
        Ok(())
    }

@@ -3668,17 +3687,21 @@ pub(crate) mod harness {

    static LOG_HANDLE: OnceCell<()> = OnceCell::new();

+    pub(crate) fn setup_logging() {
+        LOG_HANDLE.get_or_init(|| {
+            logging::init(
+                logging::LogFormat::Test,
+                // enable it in case the tests exercise code paths that use
+                // debug_assert_current_span_has_tenant_and_timeline_id
+                logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+            )
+            .expect("Failed to init test logging")
+        });
+    }
+
    impl TenantHarness {
        pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
-            LOG_HANDLE.get_or_init(|| {
-                logging::init(
-                    logging::LogFormat::Test,
-                    // enable it in case in case the tests exercise code paths that use
-                    // debug_assert_current_span_has_tenant_and_timeline_id
-                    logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
-                )
-                .expect("Failed to init test logging")
-            });
+            setup_logging();

            let repo_dir = PageServerConf::test_repo_dir(test_name);
            let _ = fs::remove_dir_all(&repo_dir);
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1,7 +1,7 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

-use camino::{Utf8Path, Utf8PathBuf};
+use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use rand::{distributions::Alphanumeric, Rng};
 use std::collections::{hash_map, HashMap};
 use std::sync::Arc;
@@ -256,83 +256,99 @@ async fn init_load_generations(
    Ok(Some(generations))
 }

+/// Given a directory discovered in the pageserver's tenants/ directory, attempt
+/// to load a tenant config from it.
+///
+/// Returns Ok(None) when the entry is skipped (temporary, empty, ignored, or not a tenant ID).
+fn load_tenant_config(
+    conf: &'static PageServerConf,
+    dentry: Utf8DirEntry,
+) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
+    let tenant_dir_path = dentry.path().to_path_buf();
+    if crate::is_temporary(&tenant_dir_path) {
+        info!("Found temporary tenant directory, removing: {tenant_dir_path}");
+        // No need to use safe_remove_tenant_dir_all because this is already
+        // a temporary path
+        if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
+            error!(
+                "Failed to remove temporary directory '{}': {:?}",
+                tenant_dir_path, e
+            );
+        }
+        return Ok(None);
+    }
+
+    // This case happens if we crash during attachment before writing a config into the dir
+    let is_empty = tenant_dir_path
+        .is_empty_dir()
+        .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?;
+    if is_empty {
+        info!("removing empty tenant directory {tenant_dir_path:?}");
+        if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
+            error!(
+                "Failed to remove empty tenant directory '{}': {e:#}",
+                tenant_dir_path
+            )
+        }
+        return Ok(None);
+    }
+
+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
+    let tenant_id = match tenant_dir_path
+        .file_name()
+        .unwrap_or_default()
+        .parse::<TenantId>()
+    {
+        Ok(id) => id,
+        Err(_) => {
+            warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",);
+            return Ok(None);
+        }
+    };
+
+    Ok(Some((
+        tenant_id,
+        Tenant::load_tenant_config(conf, &tenant_id),
+    )))
+}
+
 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
 /// and load configurations for the tenants we found.
+///
+/// Do this in parallel, because we expect 10k+ tenants, so serial execution can take
+/// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
    let tenants_dir = conf.tenants_path();

-    let mut dir_entries = tenants_dir
-        .read_dir_utf8()
-        .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+    let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
+        let dir_entries = tenants_dir
+            .read_dir_utf8()
+            .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
+
+        Ok(dir_entries.collect::<Result<Vec<_>, std::io::Error>>()?)
+    })
+    .await??;

    let mut configs = HashMap::new();

-    loop {
-        match dir_entries.next() {
-            None => break,
-            Some(Ok(dentry)) => {
-                let tenant_dir_path = dentry.path().to_path_buf();
-                if crate::is_temporary(&tenant_dir_path) {
-                    info!("Found temporary tenant directory, removing: {tenant_dir_path}");
-                    // No need to use safe_remove_tenant_dir_all because this is already
-                    // a temporary path
-                    if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove temporary directory '{}': {:?}",
-                            tenant_dir_path, e
-                        );
-                    }
-                    continue;
-                }
+    let mut join_set = JoinSet::new();
+    for dentry in dentries {
+        join_set.spawn_blocking(move || load_tenant_config(conf, dentry));
+    }

-                // This case happens if we:
-                // * crash during attach before creating the attach marker file
-                // * crash during tenant delete before removing tenant directory
-                let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
-                    format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
-                })?;
-                if is_empty {
-                    info!("removing empty tenant directory {tenant_dir_path:?}");
-                    if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
-                        error!(
-                            "Failed to remove empty tenant directory '{}': {e:#}",
-                            tenant_dir_path
-                        )
-                    }
-                    continue;
-                }
-
-                let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-                if tenant_ignore_mark_file.exists() {
-                    info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-                    continue;
-                }
-
-                let tenant_id = match tenant_dir_path
-                    .file_name()
-                    .unwrap_or_default()
-                    .parse::<TenantId>()
-                {
-                    Ok(id) => id,
-                    Err(_) => {
-                        warn!(
-                            "Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
-                        );
-                        continue;
-                    }
-                };
-
-                configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
-            }
-            Some(Err(e)) => {
-                // An error listing the top level directory indicates serious problem
-                // with local filesystem: we will fail to load, and fail to start.
-                anyhow::bail!(e);
-            }
+    while let Some(r) = join_set.join_next().await {
+        if let Some((tenant_id, tenant_config)) = r?? {
+            configs.insert(tenant_id, tenant_config);
        }
    }
+
    Ok(configs)
 }

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -18,7 +18,7 @@ use crate::config::PageServerConf;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::Generation;
+use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
@@ -190,6 +190,12 @@ pub async fn list_remote_timelines(
    let mut timeline_ids = HashSet::new();

    for timeline_remote_storage_key in timelines {
+        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
+            // A `deleted` key within `timelines/` is a marker file, not a timeline.  Ignore it.
+            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
+            continue;
+        }
+
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -27,13 +27,14 @@ use std::collections::VecDeque;
 use std::io;
 use std::io::prelude::*;
 use std::ops::{Deref, DerefMut};
-use std::os::unix::io::{AsRawFd, RawFd};
+use std::os::unix::io::AsRawFd;
 use std::os::unix::prelude::CommandExt;
 use std::process::Stdio;
-use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
+use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -47,7 +48,6 @@ use crate::metrics::{
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
-use crate::task_mgr::BACKGROUND_RUNTIME;
 use crate::walrecord::NeonWalRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
@@ -72,8 +72,6 @@ pub(crate) struct BufferTag {

 struct ProcessInput {
    stdin: ChildStdin,
-    stderr_fd: RawFd,
-    stdout_fd: RawFd,
    n_requests: usize,
 }

@@ -121,6 +119,7 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
+    /// CANCEL SAFETY: NOT CANCEL SAFE.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -153,6 +152,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
+                    .await
                };
                img = Some(result?);

@@ -173,6 +173,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
+            .await
        }
    }
 }
@@ -194,7 +195,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -283,19 +284,20 @@ impl PostgresRedoManager {
                );
                // Avoid concurrent callers hitting the same issue.
                // We can't prevent it from happening because we want to enable parallelism.
-                let mut guard = self.redo_process.write().unwrap();
-                match &*guard {
-                    Some(current_field_value) => {
-                        if Arc::ptr_eq(current_field_value, &proc) {
-                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                            *guard = None;
+                {
+                    let mut guard = self.redo_process.write().unwrap();
+                    match &*guard {
+                        Some(current_field_value) => {
+                            if Arc::ptr_eq(current_field_value, &proc) {
+                                // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                *guard = None;
+                            }
+                        }
+                        None => {
+                            // Another thread was faster to observe the error, and already took the process out of rotation.
                        }
                    }
-                    None => {
-                        // Another thread was faster to observe the error, and already took the process out of rotation.
-                    }
                }
-                drop(guard);
                // NB: there may still be other concurrent threads using `proc`.
                // The last one will send SIGKILL when the underlying Arc reaches refcount 0.
                // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
@@ -308,7 +310,12 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
+                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
+                wait_done
+                    .wait_for(|v| *v)
+                    .await
+                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -619,7 +626,8 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
-    stderr: Mutex<ChildStderr>,
+    stderr_logger_cancel: CancellationToken,
+    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -668,7 +676,6 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
-
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -682,16 +689,73 @@ impl WalRedoProcess {
        set_nonblock_or_log_err!(stdout)?;
        set_nonblock_or_log_err!(stderr)?;

+        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;
+
        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

+        let stderr_logger_cancel = CancellationToken::new();
+        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
+            tokio::sync::watch::channel(false);
+        tokio::spawn({
+            let stderr_logger_cancel = stderr_logger_cancel.clone();
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    let _ = stderr_logger_task_done_tx.send(true);
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                loop {
+                    // NB: we purposefully don't do a select! for the cancellation here.
+                    // The cancellation would likely cause us to miss stderr messages.
+                    // We can rely on this to return from .await because when we SIGKILL
+                    // the child, the writing end of the stderr pipe gets closed.
+                    match stderr.readable_mut().await {
+                        Ok(mut guard) => {
+                            let mut errbuf = [0; 16384];
+                            let res = guard.try_io(|fd| {
+                                use std::io::Read;
+                                fd.get_mut().read(&mut errbuf)
+                            });
+                            match res {
+                                Ok(Ok(0)) => {
+                                    // it closed the stderr pipe
+                                    break;
+                                }
+                                Ok(Ok(n)) => {
+                                    // The message might not be split correctly into lines here. But this is
+                                    // good enough, the important thing is to get the message to the log.
+                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
+                                    error!(output, "received output");
+                                },
+                                Ok(Err(e)) => {
+                                    error!(error = ?e, "read() error, waiting for cancellation");
+                                    stderr_logger_cancel.cancelled().await;
+                                    error!(error = ?e, "read() error, cancellation complete");
+                                    break;
+                                }
+                                Err(e) => {
+                                    let _e: tokio::io::unix::TryIoError = e;
+                                    // the read() returned WouldBlock, that's expected
+                                }
+                            }
+                        }
+                        Err(e) => {
+                            error!(error = ?e, "read() error, waiting for cancellation");
+                            stderr_logger_cancel.cancelled().await;
+                            error!(error = ?e, "read() error, cancellation complete");
+                            break;
+                        }
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
+        });
+
        Ok(Self {
            conf,
            tenant_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
-                stdout_fd: stdout.as_raw_fd(),
-                stderr_fd: stderr.as_raw_fd(),
                stdin,
                n_requests: 0,
            }),
@@ -700,7 +764,8 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
-            stderr: Mutex::new(stderr),
+            stderr_logger_cancel,
+            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -774,19 +839,11 @@ impl WalRedoProcess {
        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
        let mut nwrite = 0usize;

-        // Prepare for calling poll()
-        let mut pollfds = [
-            PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT),
-            PollFd::new(proc.stderr_fd, PollFlags::POLLIN),
-            PollFd::new(proc.stdout_fd, PollFlags::POLLIN),
-        ];
+        let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];

-        // We do two things simultaneously: send the old base image and WAL records to
-        // the child process's stdin and forward any logging
-        // information that the child writes to its stderr to the page server's log.
        while nwrite < writebuf.len() {
            let n = loop {
-                match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
@@ -796,31 +853,8 @@ impl WalRedoProcess {
                anyhow::bail!("WAL redo timed out");
            }

-            // If we have some messages in stderr, forward them to the log.
-            let err_revents = pollfds[1].revents().unwrap();
-            if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                let mut errbuf: [u8; 16384] = [0; 16384];
-                let mut stderr = self.stderr.lock().unwrap();
-                let len = stderr.read(&mut errbuf)?;
-
-                // The message might not be split correctly into lines here. But this is
-                // good enough, the important thing is to get the message to the log.
-                if len > 0 {
-                    error!(
-                        "wal-redo-postgres: {}",
-                        String::from_utf8_lossy(&errbuf[0..len])
-                    );
-
-                    // To make sure we capture all log from the process if it fails, keep
-                    // reading from the stderr, before checking the stdout.
-                    continue;
-                }
-            } else if err_revents.contains(PollFlags::POLLHUP) {
-                anyhow::bail!("WAL redo process closed its stderr unexpectedly");
-            }
-
            // If 'stdin' is writeable, do write.
-            let in_revents = pollfds[0].revents().unwrap();
+            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
            } else if in_revents.contains(PollFlags::POLLHUP) {
@@ -845,6 +879,7 @@ impl WalRedoProcess {
        // advancing processed responses number.

        let mut output = self.stdout.lock().unwrap();
+        let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
@@ -855,7 +890,10 @@ impl WalRedoProcess {
                // We do two things simultaneously: reading response from stdout
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
-                    match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
@@ -865,31 +903,8 @@ impl WalRedoProcess {
                    anyhow::bail!("WAL redo timed out");
                }

-                // If we have some messages in stderr, forward them to the log.
-                let err_revents = pollfds[1].revents().unwrap();
-                if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    let mut errbuf: [u8; 16384] = [0; 16384];
-                    let mut stderr = self.stderr.lock().unwrap();
-                    let len = stderr.read(&mut errbuf)?;
-
-                    // The message might not be split correctly into lines here. But this is
-                    // good enough, the important thing is to get the message to the log.
-                    if len > 0 {
-                        error!(
-                            "wal-redo-postgres: {}",
-                            String::from_utf8_lossy(&errbuf[0..len])
-                        );
-
-                        // To make sure we capture all log from the process if it fails, keep
-                        // reading from the stderr, before checking the stdout.
-                        continue;
-                    }
-                } else if err_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stderr unexpectedly");
-                }
-
                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = pollfds[2].revents().unwrap();
+                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
                } else if out_revents.contains(PollFlags::POLLHUP) {
@@ -985,6 +1000,8 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait();
+        self.stderr_logger_cancel.cancel();
+        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }

@@ -1066,7 +1083,7 @@ impl Drop for NoLeakChild {
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
-        BACKGROUND_RUNTIME.spawn(async move {
+        tokio::runtime::Handle::current().spawn(async move {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive of our dropper.
@@ -1199,6 +1216,22 @@ mod tests {
        assert_eq!(page, crate::ZERO_PAGE);
    }

+    #[tokio::test]
+    async fn test_stderr() {
+        let h = RedoHarness::new().unwrap();
+        h
+            .manager
+            .request_redo(
+                Key::from_i128(0),
+                Lsn::INVALID,
+                None,
+                short_records(),
+                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+            )
+            .await
+            .unwrap_err();
+    }
+
    #[allow(clippy::octal_escapes)]
    fn short_records() -> Vec<(Lsn, NeonWalRecord)> {
        vec![
@@ -1227,6 +1260,8 @@ mod tests {

    impl RedoHarness {
        fn new() -> anyhow::Result<Self> {
+            crate::tenant::harness::setup_logging();
+
            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -9,7 +9,6 @@ OBJS = \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
-	neon_walreader.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
@@ -42,17 +41,6 @@ libwalproposer.a: $(WALPROP_OBJS)
 	rm -f $@
 	$(AR) $(AROPT) $@ $^

-# needs vars:
-# FIND_TYPEDEF pointing to find_typedef
-# INDENT pointing to pg_bsd_indent
-# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
-#   pgindent will pick it up as pg_bsd_indent path).
-.PHONY: pgindent
-pgindent:
-	+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
-	$(FIND_TYPEDEF) . > neon.typedefs
-	INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
-
 PG_CONFIG = pg_config
 PGXS := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;

 /* Curl structures for sending the HTTP requests */
-static CURL *CurlHandle;
+static CURL * CurlHandle;
 static struct curl_slist *ContentHeader = NULL;

 /*
@@ -54,7 +54,7 @@ typedef enum
 {
 	Op_Set,						/* An upsert: Either a creation or an alter */
 	Op_Delete,
-} OpType;
+}			OpType;

 typedef struct
 {
@@ -62,7 +62,7 @@ typedef struct
 	Oid			owner;
 	char		old_name[NAMEDATALEN];
 	OpType		type;
-} DbEntry;
+}			DbEntry;

 typedef struct
 {
@@ -70,7 +70,7 @@ typedef struct
 	char		old_name[NAMEDATALEN];
 	const char *password;
 	OpType		type;
-} RoleEntry;
+}			RoleEntry;

 /*
 * We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
 	struct DdlHashTable *prev_table;
 	HTAB	   *db_table;
 	HTAB	   *role_table;
-} DdlHashTable;
+}			DdlHashTable;

 static DdlHashTable RootTable;
-static DdlHashTable *CurrentDdlTable = &RootTable;
+static DdlHashTable * CurrentDdlTable = &RootTable;

 static void
 PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
 {
 	char		str[ERROR_SIZE];
 	size_t		size;
-} ErrorString;
+}			ErrorString;

 static size_t
 ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -25,80 +25,79 @@

 #include <curl/curl.h>

-static int	extension_server_port = 0;
+static int extension_server_port = 0;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

-/*  to download all SQL (and data) files for an extension: */
-/*  curl -X POST http://localhost:8080/extension_server/postgis */
-/*  it covers two possible extension files layouts: */
-/*  1. extension_name--version--platform.sql */
-/*  2. extension_name/extension_name--version.sql */
-/*     extension_name/extra_files.csv */
-/*  */
-/*  to download specific library file: */
-/*  curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */
+// to download all SQL (and data) files for an extension:
+// curl -X POST http://localhost:8080/extension_server/postgis
+// it covers two possible extension files layouts:
+// 1. extension_name--version--platform.sql
+// 2. extension_name/extension_name--version.sql
+//    extension_name/extra_files.csv
+//
+// to download specific library file:
+// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-	CURL	   *curl;
-	CURLcode	res;
-	char	   *compute_ctl_url;
-	char	   *postdata;
-	bool		ret = false;
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    char *postdata;
+    bool ret = false;

-	if ((curl = curl_easy_init()) == NULL)
-	{
-		elog(ERROR, "Failed to initialize curl handle");
-	}
+    if ((curl = curl_easy_init()) == NULL)
+    {
+        elog(ERROR, "Failed to initialize curl handle");
+    }

-	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-							   extension_server_port, filename, is_library ? "?is_library=true" : "");
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");

-	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

-	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);

-	if (curl)
-	{
-		/* Perform the request, res will get the return code */
-		res = curl_easy_perform(curl);
-		/* Check for errors */
-		if (res == CURLE_OK)
-		{
-			ret = true;
-		}
-		else
-		{
-			/* Don't error here because postgres will try to find the file */
-			/* and will fail with some proper error message if it's not found. */
-			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-		}
+    if (curl)
+    {
+        /* Perform the request, res will get the return code */
+        res = curl_easy_perform(curl);
+        /* Check for errors */
+        if (res == CURLE_OK)
+        {
+            ret = true;
+        }
+        else
+        {
+            // Don't error here because postgres will try to find the file
+            // and will fail with some proper error message if it's not found.
+            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
+        }

-		/* always cleanup */
-		curl_easy_cleanup(curl);
-	}
+        /* always cleanup */
+        curl_easy_cleanup(curl);
+    }

-	return ret;
+    return ret;
 }

-void
-pg_init_extension_server()
+void pg_init_extension_server()
 {
-	/* Port to connect to compute_ctl on localhost */
-	/* to request extension files. */
-	DefineCustomIntVariable("neon.extension_server_port",
-							"connection string to the compute_ctl",
-							NULL,
-							&extension_server_port,
-							0, 0, INT_MAX,
-							PGC_POSTMASTER,
-							0,	/* no flags required */
-							NULL, NULL, NULL);
+    // Port to connect to compute_ctl on localhost
+    // to request extension files.
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "connection string to the compute_ctl",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);

-	/* set download_extension_file_hook */
-	prev_download_extension_file_hook = download_extension_file_hook;
-	download_extension_file_hook = neon_download_extension_file_http;
+    // set download_extension_file_hook
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
 }
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -67,33 +67,31 @@ typedef struct FileCacheEntry
 	BufferTag	key;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
-	dlist_node	lru_node;		/* LRU list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK/32];
+	dlist_node	lru_node; /* LRU list node */
 } FileCacheEntry;

 typedef struct FileCacheControl
 {
-	uint64		generation;		/* generation is needed to handle correct hash
-								 * reenabling */
-	uint32		size;			/* size of cache file in chunks */
-	uint32		used;			/* number of used chunks */
-	dlist_head	lru;			/* double linked list for LRU replacement
-								 * algorithm */
+	uint64 generation; /* generation is needed to handle correct hash reenabling */
+	uint32 size; /* size of cache file in chunks */
+	uint32 used; /* number of used chunks */
+	dlist_head lru; /* double linked list for LRU replacement algorithm */
 } FileCacheControl;

-static HTAB *lfc_hash;
-static int	lfc_desc = 0;
+static HTAB* lfc_hash;
+static int   lfc_desc = 0;
 static LWLockId lfc_lock;
-static int	lfc_max_size;
-static int	lfc_size_limit;
-static char *lfc_path;
-static FileCacheControl *lfc_ctl;
+static int   lfc_max_size;
+static int   lfc_size_limit;
+static char* lfc_path;
+static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif

-void		FileCacheMonitorMain(Datum main_arg);
+void FileCacheMonitorMain(Datum main_arg);

 /*
 * Local file cache is mandatory and Neon can work without it.
@@ -102,10 +100,10 @@ void		FileCacheMonitorMain(Datum main_arg);
 * All cache content should be invalidated to avoid reading of stale or corrupted data
 */
 static void
-lfc_disable(char const *op)
+lfc_disable(char const* op)
 {
 	HASH_SEQ_STATUS status;
-	FileCacheEntry *entry;
+	FileCacheEntry* entry;

 	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

@@ -139,10 +137,9 @@ lfc_ensure_opened(void)
 	/* Open cache file if not done yet */
 	if (lfc_desc <= 0)
 	{
-		lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);

-		if (lfc_desc < 0)
-		{
+		if (lfc_desc < 0) {
 			lfc_disable("open");
 			return false;
 		}
@@ -153,7 +150,7 @@ lfc_ensure_opened(void)
 static void
 lfc_shmem_startup(void)
 {
-	bool		found;
+	bool found;
 	static HASHCTL info;

 	if (prev_shmem_startup_hook)
@@ -163,21 +160,16 @@ lfc_shmem_startup(void)

 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

-	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+	lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
 	{
-		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
-
-		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
+		uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);
 		lfc_hash = ShmemInitHash("lfc_hash",
-
-		/*
-		 * lfc_size+1 because we add new element to hash table before eviction
-		 * of victim
-		 */
-								 lfc_size + 1, lfc_size + 1,
+								 /* lfc_size+1 because we add new element to hash table before eviction of victim */
+								 lfc_size+1, lfc_size+1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -186,7 +178,7 @@ lfc_shmem_startup(void)
 		dlist_init(&lfc_ctl->lru);

 		/* Remove file cache on restart */
-		(void) unlink(lfc_path);
+		(void)unlink(lfc_path);
 	}
 	LWLockRelease(AddinShmemInitLock);
 }
@@ -199,7 +191,7 @@ lfc_shmem_request(void)
 		prev_shmem_request_hook();
 #endif

-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
+	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
 	RequestNamedLWLockTranche("lfc_lock", 1);
 }

@@ -217,14 +209,11 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
 static void
 lfc_change_limit_hook(int newval, void *extra)
 {
-	uint32		new_size = SIZE_MB_TO_CHUNKS(newval);
-
+	uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
 	/*
-	 * Stats collector detach shared memory, so we should not try to access
-	 * shared memory here. Parallel workers first assign default value (0), so
-	 * not perform truncation in parallel workers. The Postmaster can handle
-	 * SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
-	 * but has no PGPROC.
+	 * The stats collector detaches from shared memory, so we should not try to access shared memory here.
+	 * Parallel workers first assign the default value (0), so do not perform truncation in parallel workers.
+	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
 	if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
 		return;
@@ -232,9 +221,8 @@ lfc_change_limit_hook(int newval, void *extra)
 	/* Open cache file if not done yet */
 	if (lfc_desc <= 0)
 	{
-		lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
-		if (lfc_desc < 0)
-		{
+		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+		if (lfc_desc < 0) {
 			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
 			lfc_size_limit = 0; /* disable file cache */
 			return;
@@ -243,15 +231,11 @@ lfc_change_limit_hook(int newval, void *extra)
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
 	while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
 	{
-		/*
-		 * Shrink cache by throwing away least recently accessed chunks and
-		 * returning their space to file system
-		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
-
+		/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
+		FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
 		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
-		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
+		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
 			elog(LOG, "Failed to punch hole in file: %m");
 #endif
 		hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
@@ -275,7 +259,7 @@ lfc_init(void)
 							"Maximal size of Neon local file cache",
 							NULL,
 							&lfc_max_size,
-							0,	/* disabled by default */
+							0, /* disabled by default */
 							0,
 							INT_MAX,
 							PGC_POSTMASTER,
@@ -288,7 +272,7 @@ lfc_init(void)
 							"Current limit for size of Neon local file cache",
 							NULL,
 							&lfc_size_limit,
-							0,	/* disabled by default */
+							0, /* disabled by default */
 							0,
 							INT_MAX,
 							PGC_SIGHUP,
@@ -328,18 +312,18 @@ lfc_init(void)
 bool
 lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	bool		found;
-	uint32		hash;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool found;
+	uint32 hash;

-	if (lfc_size_limit == 0)	/* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -355,13 +339,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 void
 lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	uint32		hash;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;

-	if (lfc_size_limit == 0)	/* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return;

 	CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -389,10 +373,9 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	 */
 	if (entry->bitmap[chunk_offs >> 5] == 0)
 	{
-		bool		has_remaining_pages;
+		bool has_remaining_pages;

-		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
-		{
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
 			if (entry->bitmap[i] != 0)
 			{
 				has_remaining_pages = true;
@@ -401,8 +384,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		}

 		/*
-		 * Put the entry at the position that is first to be reclaimed when we
-		 * have no cached pages remaining in the chunk
+		 * Put the entry at the position that is first to be reclaimed when
+		 * we have no cached pages remaining in the chunk
 		 */
 		if (!has_remaining_pages)
 		{
@@ -428,16 +411,16 @@ bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 char *buffer)
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	ssize_t		rc;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	bool		result = true;
-	uint32		hash;
-	uint64		generation;
-	uint32		entry_offset;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	bool result = true;
+	uint32 hash;
+	uint64 generation;
+	uint32 entry_offset;

-	if (lfc_size_limit == 0)	/* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;

 	if (!lfc_ensure_opened())
@@ -445,7 +428,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -464,7 +447,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	LWLockRelease(lfc_lock);

-	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		lfc_disable("read");
@@ -492,31 +475,31 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
 void
-			lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-					  char *buffer)
+		  char *buffer)
 #else
-					  const void *buffer)
+		  const void *buffer)
 #endif
 {
-	BufferTag	tag;
-	FileCacheEntry *entry;
-	ssize_t		rc;
-	bool		found;
-	int			chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-	uint32		hash;
+	BufferTag tag;
+	FileCacheEntry* entry;
+	ssize_t rc;
+	bool found;
+	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+	uint32 hash;

-	if (lfc_size_limit == 0)	/* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return;

 	if (!lfc_ensure_opened())
 		return;

 	tag.forkNum = forkNum;
-	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-
+	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	
 	CopyNRelFileInfoToBufTag(tag, rinfo);
-
+	
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -524,30 +507,24 @@ void

 	if (found)
 	{
-		/*
-		 * Unlink entry from LRU list to pin it for the duration of IO
-		 * operation
-		 */
+		/* Unlink entry from LRU list to pin it for the duration of IO operation */
 		if (entry->access_count++ == 0)
 			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
 		/*
-		 * We have two choices if all cache pages are pinned (i.e. used in IO
-		 * operations): 1. Wait until some of this operation is completed and
-		 * pages is unpinned 2. Allocate one more chunk, so that specified
-		 * cache size is more recommendation than hard limit. As far as
-		 * probability of such event (that all pages are pinned) is considered
-		 * to be very very small: there are should be very large number of
-		 * concurrent IO operations and them are limited by max_connections,
+		 * We have two choices if all cache pages are pinned (i.e. used in IO operations):
+		 * 1. Wait until some of these operations complete and their pages are unpinned
+		 * 2. Allocate one more chunk, so that the specified cache size is more of a recommendation than a hard limit.
+		 * Since the probability of such an event (all pages being pinned) is considered to be very small
+		 * (it would take a very large number of concurrent IO operations, and those are limited by max_connections),
 		 * we prefer not to complicate code and use second approach.
 		 */
 		if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
-
+			FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
@@ -556,14 +533,13 @@ void
 		else
 		{
 			lfc_ctl->used += 1;
-			entry->offset = lfc_ctl->size++;	/* allocate new chunk at end
-												 * of file */
+			entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
 		}
 		entry->access_count = 1;
 		memset(entry->bitmap, 0, sizeof entry->bitmap);
 	}

-	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry->offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 	if (rc != BLCKSZ)
 	{
 		LWLockRelease(lfc_lock);
@@ -625,9 +601,9 @@ local_cache_pages(PG_FUNCTION_ARGS)

 	if (SRF_IS_FIRSTCALL())
 	{
-		HASH_SEQ_STATUS status;
-		FileCacheEntry *entry;
-		uint32		n_pages = 0;
+        HASH_SEQ_STATUS status;
+		FileCacheEntry* entry;
+		uint32 n_pages = 0;

 		funcctx = SRF_FIRSTCALL_INIT();

@@ -677,8 +653,8 @@ local_cache_pages(PG_FUNCTION_ARGS)

 		LWLockAcquire(lfc_lock, LW_SHARED);

-		hash_seq_init(&status, lfc_hash);
-		while ((entry = hash_seq_search(&status)) != NULL)
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
 		{
 			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
@@ -704,14 +680,14 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		 * locks, so the information of each buffer is self-consistent.
 		 */
 		n_pages = 0;
-		hash_seq_init(&status, lfc_hash);
-		while ((entry = hash_seq_search(&status)) != NULL)
+        hash_seq_init(&status, lfc_hash);
+        while ((entry = hash_seq_search(&status)) != NULL)
 		{
 			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 			{
 				if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 				{
-					fctx->record[n_pages].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
+					fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
 					fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
 					fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
 					fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -60,7 +60,7 @@ int			flush_every_n_requests = 8;
 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;

-bool		(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
+bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);

@@ -80,10 +80,11 @@ pageserver_connect(int elevel)
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
 	 * variable was set, use that as the password.
 	 *
-	 * The connection options are parsed in the order they're given, so when
-	 * we set the password before the connection string, the connection string
-	 * can override the password from the env variable. Seems useful, although
-	 * we don't currently use that capability anywhere.
+	 * The connection options are parsed in the order they're given, so
+	 * when we set the password before the connection string, the
+	 * connection string can override the password from the env variable.
+	 * Seems useful, although we don't currently use that capability
+	 * anywhere.
 	 */
 	n = 0;
 	if (neon_auth_token)
@@ -126,9 +127,9 @@ pageserver_connect(int elevel)

 	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
 	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
-					  MyLatch, NULL);
+			  MyLatch, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-					  NULL, NULL);
+			  NULL, NULL);
 	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);

 	while (PQisBusy(pageserver_conn))
@@ -193,7 +194,6 @@ retry:
 			if (!PQconsumeInput(pageserver_conn))
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 				neon_log(LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
@@ -234,7 +234,7 @@ pageserver_disconnect(void)
 }

 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

@@ -249,12 +249,10 @@ pageserver_send(NeonRequest *request)

 	/*
 	 * If pageserver is stopped, the connections from compute node are broken.
-	 * The compute node doesn't notice that immediately, but it will cause the
-	 * next request to fail, usually on the next query. That causes
-	 * user-visible errors if pageserver is restarted, or the tenant is moved
-	 * from one pageserver to another. See
-	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
-	 * connection in case of failure.
+	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
+	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
+	 * See https://github.com/neondatabase/neon/issues/1138
+	 * So try to reestablish connection in case of failure.
 	 */
 	if (!connected)
 	{
@@ -277,7 +275,6 @@ pageserver_send(NeonRequest *request)
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 		pageserver_disconnect();
 		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
@@ -335,8 +332,7 @@ pageserver_receive(void)
 		}
 		else if (rc == -2)
 		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
+			char* msg = pchomp(PQerrorMessage(pageserver_conn));
 			pageserver_disconnect();
 			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
@@ -370,7 +366,6 @@ pageserver_flush(void)
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
 			pageserver_disconnect();
 			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
@@ -473,10 +468,7 @@ pg_init_libpagestore(void)
 	neon_log(PageStoreTrace, "libpagestore already loaded");
 	page_server = &api;

-	/*
-	 * Retrieve the auth token to use when connecting to pageserver and
-	 * safekeepers
-	 */
+	/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
 	neon_auth_token = getenv("NEON_AUTH_TOKEN");
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
--- a/pgxn/neon/libpqwalproposer.h
+++ b/pgxn/neon/libpqwalproposer.h
@@ -1,96 +0,0 @@
-/*
- * Interface to set of libpq wrappers walproposer and neon_walreader need.
- * Similar to libpqwalreceiver, but it has blocking connection establishment and
- * pqexec which don't fit us. Implementation is at walproposer_pg.c.
- */
-#ifndef ___LIBPQWALPROPOSER_H__
-#define ___LIBPQWALPROPOSER_H__
-
-/* Re-exported and modified ExecStatusType */
-typedef enum
-{
-	/* We received a single CopyBoth result */
-	WP_EXEC_SUCCESS_COPYBOTH,
-
-	/*
-	 * Any success result other than a single CopyBoth was received. The
-	 * specifics of the result were already logged, but it may be useful to
-	 * provide an error message indicating which safekeeper messed up.
-	 *
-	 * Do not expect PQerrorMessage to be appropriately set.
-	 */
-	WP_EXEC_UNEXPECTED_SUCCESS,
-
-	/*
-	 * No result available at this time. Wait until read-ready, then call
-	 * again. Internally, this is returned when PQisBusy indicates that
-	 * PQgetResult would block.
-	 */
-	WP_EXEC_NEEDS_INPUT,
-	/* Catch-all failure. Check PQerrorMessage. */
-	WP_EXEC_FAILED,
-} WalProposerExecStatusType;
-
-/* Possible return values from walprop_async_read */
-typedef enum
-{
-	/* The full read was successful. buf now points to the data */
-	PG_ASYNC_READ_SUCCESS,
-
-	/*
-	 * The read is ongoing. Wait until the connection is read-ready, then try
-	 * again.
-	 */
-	PG_ASYNC_READ_TRY_AGAIN,
-	/* Reading failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_READ_FAIL,
-} PGAsyncReadResult;
-
-/* Possible return values from walprop_async_write */
-typedef enum
-{
-	/* The write fully completed */
-	PG_ASYNC_WRITE_SUCCESS,
-
-	/*
-	 * The write started, but you'll need to call PQflush some more times to
-	 * finish it off. We just tried, so it's best to wait until the connection
-	 * is read- or write-ready to try again.
-	 *
-	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
-	 * becomes write-ready, just call PQflush.
-	 */
-	PG_ASYNC_WRITE_TRY_FLUSH,
-	/* Writing failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_WRITE_FAIL,
-} PGAsyncWriteResult;
-
-/*
- * This header is included by walproposer.h to define walproposer_api; if we're
- * building walproposer without pg, ignore libpq part, leaving only interface
- * types.
- */
-#ifndef WALPROPOSER_LIB
-
-#include "libpq-fe.h"
-
-/*
- * Sometimes working directly with underlying PGconn is simpler, export the
- * whole thing for simplicity.
- */
-typedef struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received CopyData message from
-								 * walprop_async_read */
-} WalProposerConn;
-
-extern WalProposerConn *libpqwp_connect_start(char *conninfo);
-extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
-extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
-extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
-extern void libpqwp_disconnect(WalProposerConn *conn);
-
-#endif							/* WALPROPOSER_LIB */
-#endif							/* ___LIBPQWALPROPOSER_H__ */
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -48,9 +48,9 @@ _PG_init(void)

 	pg_init_extension_server();

-	/* Important: This must happen after other parts of the extension */
-	/* are loaded, otherwise any settings to GUCs that were set before */
-	/* the extension was loaded will be removed. */
+	// Important: This must happen after other parts of the extension
+	// are loaded, otherwise any settings to GUCs that were set before
+	// the extension was loaded will be removed.
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
 * block_id; false otherwise.
 */
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
+extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

 extern uint64 BackpressureThrottlingTime(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -59,7 +59,7 @@

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers

-#else							/* major version >= 16 */
+#else /* major version >= 16 */

 #define USE_RELFILELOCATOR

@@ -109,4 +109,4 @@
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

-#endif							/* //NEON_PGVERSIONCOMPAT_H */
+#endif //NEON_PGVERSIONCOMPAT_H
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -1,731 +0,0 @@
-/*
- * Like WALRead, but when WAL segment doesn't exist locally instead of throwing
- * ERROR asynchronously tries to fetch it from the most advanced safekeeper.
- *
- * We can't use libpqwalreceiver as it blocks during connection establishment
- * (and waiting for PQExec result), so use libpqwalproposer instead.
- *
- * TODO: keepalives are currently never sent, so the other side can close the
- * connection prematurely.
- *
- * TODO: close conn if reading takes too long to prevent stuck connections.
- */
-#include "postgres.h"
-
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "access/xlog_internal.h"
-#include "access/xlogdefs.h"
-#include "access/xlogreader.h"
-#include "libpq/pqformat.h"
-#include "storage/fd.h"
-#include "utils/wait_event.h"
-
-#include "libpq-fe.h"
-
-#include "neon_walreader.h"
-#include "walproposer.h"
-
-#define NEON_WALREADER_ERR_MSG_LEN 512
-
-/*
- * Can be called where NeonWALReader *state is available in the context, adds log_prefix.
- */
-#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
-
-static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
-static void NeonWALReaderResetRemote(NeonWALReader *state);
-static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
-static void neon_wal_segment_close(NeonWALReader *state);
-static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
-								  TimeLineID tli);
-
-/*
- * State of connection to donor safekeeper.
- */
-typedef enum
-{
-	/* no remote connection */
-	RS_NONE,
-	/* doing PQconnectPoll, need readable socket */
-	RS_CONNECTING_READ,
-	/* doing PQconnectPoll, need writable socket */
-	RS_CONNECTING_WRITE,
-	/* Waiting for START_REPLICATION result */
-	RS_WAIT_EXEC_RESULT,
-	/* replication stream established */
-	RS_ESTABLISHED,
-} NeonWALReaderRemoteState;
-
-struct NeonWALReader
-{
-	/*
-	 * LSN before which we assume WAL is not available locally. Exists because
-	 * though first segment after startup always exists, part before
-	 * basebackup LSN is filled with zeros.
-	 */
-	XLogRecPtr	available_lsn;
-	WALSegmentContext segcxt;
-	WALOpenSegment seg;
-	int			wre_errno;
-	/* Explains failure to read, static for simplicity. */
-	char		err_msg[NEON_WALREADER_ERR_MSG_LEN];
-
-	/*
-	 * Saved info about request in progress, used to check validity of
-	 * arguments after resume and remember how far we accomplished it. req_lsn
-	 * is 0 if there is no request in progress.
-	 */
-	XLogRecPtr	req_lsn;
-	Size		req_len;
-	Size		req_progress;
-	WalProposer *wp;			/* we learn donor through walproposer */
-	char		donor_name[64]; /* saved donor safekeeper name for logging */
-	/* state of connection to safekeeper */
-	NeonWALReaderRemoteState rem_state;
-	WalProposerConn *wp_conn;
-
-	/*
-	 * position in wp_conn recvbuf from which we'll copy WAL next time, or
-	 * NULL if there is no unprocessed message
-	 */
-	char	   *wal_ptr;
-	Size		wal_rem_len;	/* how many unprocessed bytes left in recvbuf */
-
-	/*
-	 * LSN of wal_ptr position according to walsender to cross check against
-	 * read request
-	 */
-	XLogRecPtr	rem_lsn;
-
-	/* prepended to lines logged by neon_walreader, if provided */
-	char		log_prefix[64];
-};
-
-/* palloc and initialize NeonWALReader */
-NeonWALReader *
-NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
-{
-	NeonWALReader *reader;
-
-	reader = (NeonWALReader *)
-		palloc_extended(sizeof(NeonWALReader),
-						MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
-	if (!reader)
-		return NULL;
-
-	reader->available_lsn = available_lsn;
-	reader->seg.ws_file = -1;
-	reader->seg.ws_segno = 0;
-	reader->seg.ws_tli = 0;
-	reader->segcxt.ws_segsize = wal_segment_size;
-
-	reader->wp = wp;
-
-	reader->rem_state = RS_NONE;
-
-	if (log_prefix)
-		strncpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
-
-	return reader;
-}
-
-void
-NeonWALReaderFree(NeonWALReader *state)
-{
-	if (state->seg.ws_file != -1)
-		neon_wal_segment_close(state);
-	if (state->wp_conn)
-		libpqwp_disconnect(state->wp_conn);
-	pfree(state);
-}
-
-/*
- * Like vanilla WALRead, but if requested position is before available_lsn or
- * WAL segment doesn't exist on disk, it tries to fetch needed segment from the
- * advanced safekeeper.
- *
- * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
- * fetched from timeline 'tli'.
- *
- * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
- * occurs, in which case 'err' has the desciption. Error always closes remote
- * connection, if there was any, so socket subscription should be removed.
- *
- * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
- * NeonWALReaderSocket and call NeonWALRead again with exactly the same
- * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
- * docs during connection establishment (before first successful read) socket
- * underneath might change.
- *
- * Also, eventually walreader should switch from remote to local read; caller
- * should remove subscription to socket then by checking NeonWALReaderEvents
- * after successful read (otherwise next read might reopen the connection with
- * different socket).
- *
- * Reading not monotonically is not supported and will result in error.
- *
- * Caller should be sure that WAL up to requested LSN exists, otherwise
- * NEON_WALREAD_WOULDBLOCK might be always returned.
- */
-NeonWALReadResult
-NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	/*
-	 * If requested data is before known available basebackup lsn or there is
-	 * already active remote state, do remote read.
-	 */
-	if (startptr < state->available_lsn || state->rem_state != RS_NONE)
-	{
-		return NeonWALReadRemote(state, buf, startptr, count, tli);
-	}
-	if (NeonWALReadLocal(state, buf, startptr, count, tli))
-	{
-		return NEON_WALREAD_SUCCESS;
-	}
-	else if (state->wre_errno == ENOENT)
-	{
-		nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
-				LSN_FORMAT_ARGS(startptr));
-		return NeonWALReadRemote(state, buf, startptr, count, tli);
-	}
-	else
-	{
-		return NEON_WALREAD_ERROR;
-	}
-}
-
-/* Do the read from remote safekeeper. */
-static NeonWALReadResult
-NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	if (state->rem_state == RS_NONE)
-	{
-		XLogRecPtr	donor_lsn;
-
-		/* no connection yet; start one */
-		Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
-
-		if (donor == NULL)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "failed to establish remote connection to fetch WAL: no donor available");
-			return NEON_WALREAD_ERROR;
-		}
-		snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
-		nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
-				state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
-		state->wp_conn = libpqwp_connect_start(donor->conninfo);
-		if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "failed to connect to %s to fetch WAL: immediately failed with %s",
-					 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-		/* we'll poll immediately */
-		state->rem_state = RS_CONNECTING_READ;
-	}
-
-	if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
-	{
-		switch (PQconnectPoll(state->wp_conn->pg_conn))
-		{
-			case PGRES_POLLING_FAILED:
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "failed to connect to %s to fetch WAL: poll error: %s",
-						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-			case PGRES_POLLING_READING:
-				state->rem_state = RS_CONNECTING_READ;
-				return NEON_WALREAD_WOULDBLOCK;
-			case PGRES_POLLING_WRITING:
-				state->rem_state = RS_CONNECTING_WRITE;
-				return NEON_WALREAD_WOULDBLOCK;
-			case PGRES_POLLING_OK:
-				{
-					/* connection successfully established */
-					char		start_repl_query[128];
-
-					snprintf(start_repl_query, sizeof(start_repl_query),
-							 "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
-							 LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
-					nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
-							state->donor_name, start_repl_query);
-					if (!libpqwp_send_query(state->wp_conn, start_repl_query))
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "failed to send %s query to %s: %s",
-								 start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-						NeonWALReaderResetRemote(state);
-						return NEON_WALREAD_ERROR;
-					}
-					state->rem_state = RS_WAIT_EXEC_RESULT;
-					break;
-				}
-
-			default:			/* there is unused PGRES_POLLING_ACTIVE */
-				Assert(false);
-				return NEON_WALREAD_ERROR;	/* keep the compiler quiet */
-		}
-	}
-
-	if (state->rem_state == RS_WAIT_EXEC_RESULT)
-	{
-		switch (libpqwp_get_query_result(state->wp_conn))
-		{
-			case WP_EXEC_SUCCESS_COPYBOTH:
-				state->rem_state = RS_ESTABLISHED;
-				break;
-			case WP_EXEC_NEEDS_INPUT:
-				return NEON_WALREAD_WOULDBLOCK;
-			case WP_EXEC_FAILED:
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "get START_REPLICATION result from %s failed: %s",
-						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-			default:			/* can't happen */
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "get START_REPLICATION result from %s: unexpected result",
-						 state->donor_name);
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-		}
-	}
-
-	Assert(state->rem_state == RS_ESTABLISHED);
-
-	/*
-	 * If we had the request before, verify args are the same and advance the
-	 * result ptr according to the progress; otherwise register the request.
-	 */
-	if (state->req_lsn != InvalidXLogRecPtr)
-	{
-		if (state->req_lsn != startptr || state->req_len != count)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "args changed during request, was %X/%X %zu, now %X/%X %zu",
-					 LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-		nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
-				LSN_FORMAT_ARGS(startptr),
-				count,
-				state->req_progress);
-		buf += state->req_progress;
-	}
-	else
-	{
-		state->req_lsn = startptr;
-		state->req_len = count;
-		state->req_progress = 0;
-		nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
-				LSN_FORMAT_ARGS(startptr),
-				count);
-	}
-
-	while (true)
-	{
-		Size		to_copy;
-
-		/*
-		 * If we have no ready data, receive new message.
-		 */
-		if (state->wal_rem_len == 0 &&
-
-		/*
-		 * check for the sake of 0 length reads; walproposer does these for
-		 * heartbeats, though generally they shouldn't hit remote source.
-		 */
-			state->req_len - state->req_progress > 0)
-		{
-			NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
-
-			if (read_msg_res != NEON_WALREAD_SUCCESS)
-				return read_msg_res;
-		}
-
-		if (state->req_lsn + state->req_progress != state->rem_lsn)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
-					 LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
-					 LSN_FORMAT_ARGS(state->rem_lsn),
-					 LSN_FORMAT_ARGS(state->req_lsn),
-					 state->req_len);
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-
-		/* We can copy min of (available, requested) bytes. */
-		to_copy =
-			Min(state->req_len - state->req_progress, state->wal_rem_len);
-		memcpy(buf, state->wal_ptr, to_copy);
-		state->wal_ptr += to_copy;
-		state->wal_rem_len -= to_copy;
-		state->rem_lsn += to_copy;
-		if (state->wal_rem_len == 0)
-			state->wal_ptr = NULL;	/* freed by libpqwalproposer */
-		buf += to_copy;
-		state->req_progress += to_copy;
-		if (state->req_progress == state->req_len)
-		{
-			XLogSegNo	next_segno;
-			XLogSegNo	req_segno;
-
-			XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
-			XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
-
-			/*
-			 * Request completed. If there is a chance of serving next one
-			 * locally, close the connection.
-			 */
-			if (state->req_lsn < state->available_lsn &&
-				state->rem_lsn >= state->available_lsn)
-			{
-				nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
-						LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
-				NeonWALReaderResetRemote(state);
-			}
-			else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
-			         is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
-			{
-				nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
-						LSN_FORMAT_ARGS(state->rem_lsn));
-				NeonWALReaderResetRemote(state);
-			}
-			state->req_lsn = InvalidXLogRecPtr;
-			state->req_len = 0;
-			state->req_progress = 0;
-			return NEON_WALREAD_SUCCESS;
-		}
-	}
-}
-
-/*
- * Read one WAL message from the stream, sets state->wal_ptr in case of success.
- * Resets remote state in case of failure.
- */
-static NeonWALReadResult
-NeonWALReaderReadMsg(NeonWALReader *state)
-{
-	while (true)				/* loop until we get 'w' */
-	{
-		char	   *copydata_ptr;
-		int			copydata_size;
-		StringInfoData s;
-		char		msg_type;
-		int			hdrlen;
-
-		Assert(state->rem_state == RS_ESTABLISHED);
-		Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
-
-		switch (libpqwp_async_read(state->wp_conn,
-								   &copydata_ptr,
-								   &copydata_size))
-		{
-			case PG_ASYNC_READ_SUCCESS:
-				break;
-			case PG_ASYNC_READ_TRY_AGAIN:
-				return NEON_WALREAD_WOULDBLOCK;
-			case PG_ASYNC_READ_FAIL:
-				snprintf(state->err_msg,
-						 sizeof(state->err_msg),
-						 "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
-						 LSN_FORMAT_ARGS(state->req_lsn),
-						 state->req_len,
-						 state->req_progress,
-						 PQerrorMessage(state->wp_conn->pg_conn));
-				goto err;
-		}
-
-		/* put data on StringInfo to parse */
-		s.data = copydata_ptr;
-		s.len = copydata_size;
-		s.cursor = 0;
-		s.maxlen = -1;
-
-		if (copydata_size == 0)
-		{
-			snprintf(state->err_msg,
-					 sizeof(state->err_msg),
-					 "zero length copydata received");
-			goto err;
-		}
-		msg_type = pq_getmsgbyte(&s);
-		switch (msg_type)
-		{
-			case 'w':
-				{
-					XLogRecPtr	start_lsn;
-
-					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
-					if (s.len - s.cursor < hdrlen)
-					{
-						snprintf(state->err_msg,
-								 sizeof(state->err_msg),
-								 "invalid WAL message received from primary");
-						goto err;
-					}
-
-					start_lsn = pq_getmsgint64(&s);
-					pq_getmsgint64(&s); /* XLogRecPtr	end_lsn; */
-					pq_getmsgint64(&s); /* TimestampTz send_time */
-
-					state->rem_lsn = start_lsn;
-					state->wal_rem_len = (Size) (s.len - s.cursor);
-					state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
-					nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
-							LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
-
-					return NEON_WALREAD_SUCCESS;
-				}
-			case 'k':
-				{
-					XLogRecPtr	end_lsn;
-					bool		reply_requested;
-
-					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
-					if (s.len - s.cursor < hdrlen)
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "invalid keepalive message received from primary");
-						goto err;
-					}
-
-					end_lsn = pq_getmsgint64(&s);
-					pq_getmsgint64(&s); /* TimestampTz timestamp; */
-					reply_requested = pq_getmsgbyte(&s);
-					nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
-							LSN_FORMAT_ARGS(end_lsn),
-							reply_requested);
-					if (end_lsn < state->req_lsn + state->req_len)
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
-								 LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
-						goto err;
-					}
-					continue;
-				}
-			default:
-				nwr_log(WARNING, "invalid replication message type %d", msg_type);
-				continue;
-		}
-	}
-err:
-	NeonWALReaderResetRemote(state);
-	return NEON_WALREAD_ERROR;
-}
-
-/* reset remote connection and request in progress */
-static void
-NeonWALReaderResetRemote(NeonWALReader *state)
-{
-	state->req_lsn = InvalidXLogRecPtr;
-	state->req_len = 0;
-	state->req_progress = 0;
-	state->rem_state = RS_NONE;
-	if (state->wp_conn)
-	{
-		libpqwp_disconnect(state->wp_conn);
-		state->wp_conn = NULL;
-	}
-	state->donor_name[0] = '\0';
-	state->wal_ptr = NULL;
-	state->wal_rem_len = 0;
-	state->rem_lsn = InvalidXLogRecPtr;
-}
-
-/*
- * Return socket of connection to remote source. Must be called only when
- * connection exists (NeonWALReaderEvents returns non zero).
- */
-pgsocket
-NeonWALReaderSocket(NeonWALReader *state)
-{
-	if (!state->wp_conn)
-		nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
-	return PQsocket(state->wp_conn->pg_conn);
-}
-
-/*
- * Returns events user should wait on connection socket or 0 if remote
- * connection is not active.
- */
-extern uint32
-NeonWALReaderEvents(NeonWALReader *state)
-{
-	switch (state->rem_state)
-	{
-		case RS_NONE:
-			return 0;
-		case RS_CONNECTING_READ:
-			return WL_SOCKET_READABLE;
-		case RS_CONNECTING_WRITE:
-			return WL_SOCKET_WRITEABLE;
-		case RS_WAIT_EXEC_RESULT:
-		case RS_ESTABLISHED:
-			return WL_SOCKET_READABLE;
-		default:
-			Assert(false);
-			return 0;			/* make compiler happy */
-	}
-}
-
-static bool
-NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	char	   *p;
-	XLogRecPtr	recptr;
-	Size		nbytes;
-
-	p = buf;
-	recptr = startptr;
-	nbytes = count;
-
-	while (nbytes > 0)
-	{
-		uint32		startoff;
-		int			segbytes;
-		int			readbytes;
-
-		startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
-
-		/*
-		 * If the data we want is not in a segment we have open, close what we
-		 * have (if anything) and open the next one, using the caller's
-		 * provided openSegment callback.
-		 */
-		if (state->seg.ws_file < 0 ||
-			!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
-			tli != state->seg.ws_tli)
-		{
-			XLogSegNo	nextSegNo;
-
-			neon_wal_segment_close(state);
-
-			XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
-			if (!neon_wal_segment_open(state, nextSegNo, &tli))
-			{
-				char		fname[MAXFNAMELEN];
-
-				state->wre_errno = errno;
-
-				XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
-				snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
-						 fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
-				return false;
-			}
-
-			/* This shouldn't happen -- indicates a bug in segment_open */
-			Assert(state->seg.ws_file >= 0);
-
-			/* Update the current segment info. */
-			state->seg.ws_tli = tli;
-			state->seg.ws_segno = nextSegNo;
-		}
-
-		/* How many bytes are within this segment? */
-		if (nbytes > (state->segcxt.ws_segsize - startoff))
-			segbytes = state->segcxt.ws_segsize - startoff;
-		else
-			segbytes = nbytes;
-
-#ifndef FRONTEND
-		pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
-#endif
-
-		/* Reset errno first; eases reporting non-errno-affecting errors */
-		errno = 0;
-		readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
-
-#ifndef FRONTEND
-		pgstat_report_wait_end();
-#endif
-
-		if (readbytes <= 0)
-		{
-			char		fname[MAXFNAMELEN];
-
-			XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
-
-			if (readbytes < 0)
-			{
-				state->wre_errno = errno;
-				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
-						 fname, startoff, strerror(state->wre_errno));
-			}
-			else
-			{
-				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
-						 fname, startoff);
-			}
-			return false;
-		}
-
-		/* Update state for read */
-		recptr += readbytes;
-		nbytes -= readbytes;
-		p += readbytes;
-	}
-
-	return true;
-}
-
-/*
- * Copy of vanilla wal_segment_open, but returns false in case of error instead
- * of ERROR, with errno set.
- *
- * XLogReaderRoutine->segment_open callback for local pg_wal files
- */
-static bool
-neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
-					  TimeLineID *tli_p)
-{
-	TimeLineID	tli = *tli_p;
-	char		path[MAXPGPATH];
-
-	XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
-	nwr_log(LOG, "opening %s", path);
-	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (state->seg.ws_file >= 0)
-		return true;
-
-	return false;
-}
-
-static bool
-is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
-{
-	struct stat stat_buffer;
-	char		path[MAXPGPATH];
-
-	XLogFilePath(path, tli, segno, segsize);
-	return stat(path, &stat_buffer) == 0;
-}
-
-/* copy of vanilla wal_segment_close with NeonWALReader */
-static void
-neon_wal_segment_close(NeonWALReader *state)
-{
-	if (state->seg.ws_file >= 0)
-	{
-		close(state->seg.ws_file);
-		/* need to check errno? */
-		state->seg.ws_file = -1;
-	}
-}
-
-char *
-NeonWALReaderErrMsg(NeonWALReader *state)
-{
-	return state->err_msg;
-}
--- a/pgxn/neon/neon_walreader.h
+++ b/pgxn/neon/neon_walreader.h
@@ -1,29 +0,0 @@
-#ifndef __NEON_WALREADER_H__
-#define __NEON_WALREADER_H__
-
-#include "access/xlogdefs.h"
-
-/* forward declare so we don't have to expose the struct to the public */
-struct NeonWALReader;
-typedef struct NeonWALReader NeonWALReader;
-
-/* avoid including walproposer.h as it includes us */
-struct WalProposer;
-typedef struct WalProposer WalProposer;
-
-/* NeonWALRead return value */
-typedef enum
-{
-	NEON_WALREAD_SUCCESS,
-	NEON_WALREAD_WOULDBLOCK,
-	NEON_WALREAD_ERROR,
-} NeonWALReadResult;
-
-extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
-extern void NeonWALReaderFree(NeonWALReader *state);
-extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
-extern uint32 NeonWALReaderEvents(NeonWALReader *state);
-extern char *NeonWALReaderErrMsg(NeonWALReader *state);
-
-#endif							/* __NEON_WALREADER_H__ */
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -40,13 +40,13 @@ typedef enum
 	T_NeonGetPageResponse,
 	T_NeonErrorResponse,
 	T_NeonDbSizeResponse,
-} NeonMessageTag;
+}			NeonMessageTag;

 /* base struct for c-style inheritance */
 typedef struct
 {
 	NeonMessageTag tag;
-} NeonMessage;
+}			NeonMessage;

 #define messageTag(m) (((const NeonMessage *)(m))->tag)

@@ -67,27 +67,27 @@ typedef struct
 	NeonMessageTag tag;
 	bool		latest;			/* if true, request latest page version */
 	XLogRecPtr	lsn;			/* request page version @ this LSN */
-} NeonRequest;
+}			NeonRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-} NeonExistsRequest;
+}			NeonExistsRequest;

 typedef struct
 {
 	NeonRequest req;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-} NeonNblocksRequest;
+}			NeonNblocksRequest;

 typedef struct
 {
 	NeonRequest req;
 	Oid			dbNode;
-} NeonDbSizeRequest;
+}			NeonDbSizeRequest;

 typedef struct
 {
@@ -95,31 +95,31 @@ typedef struct
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber blkno;
-} NeonGetPageRequest;
+}			NeonGetPageRequest;

 /* supertype of all the Neon*Response structs below */
 typedef struct
 {
 	NeonMessageTag tag;
-} NeonResponse;
+}			NeonResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	bool		exists;
-} NeonExistsResponse;
+}			NeonExistsResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	uint32		n_blocks;
-} NeonNblocksResponse;
+}			NeonNblocksResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		page[FLEXIBLE_ARRAY_MEMBER];
-} NeonGetPageResponse;
+}			NeonGetPageResponse;

 #define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))

@@ -127,18 +127,18 @@ typedef struct
 {
 	NeonMessageTag tag;
 	int64		db_size;
-} NeonDbSizeResponse;
+}			NeonDbSizeResponse;

 typedef struct
 {
 	NeonMessageTag tag;
 	char		message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
 												 * message */
-} NeonErrorResponse;
+}			NeonErrorResponse;

-extern StringInfoData nm_pack_request(NeonRequest *msg);
-extern NeonResponse *nm_unpack_response(StringInfo s);
-extern char *nm_to_string(NeonMessage *msg);
+extern StringInfoData nm_pack_request(NeonRequest * msg);
+extern NeonResponse * nm_unpack_response(StringInfo s);
+extern char *nm_to_string(NeonMessage * msg);

 /*
 * API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage *msg);

 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
+	bool		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
 	bool		(*flush) (void);
-} page_server_api;
+}			page_server_api;

 extern void prefetch_on_ps_disconnect(void);

-extern page_server_api *page_server;
+extern page_server_api * page_server;

 extern char *page_server_connstring;
-extern int	flush_every_n_requests;
-extern int	readahead_buffer_size;
+extern int flush_every_n_requests;
+extern int readahead_buffer_size;
 extern bool seqscan_prefetch_enabled;
-extern int	seqscan_prefetch_distance;
+extern int seqscan_prefetch_distance;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, bool request_latest, char *buffer);
+							 XLogRecPtr request_lsn, bool request_latest, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, bool request_latest, void *buffer);
+							 XLogRecPtr request_lsn, bool request_latest, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -100,21 +100,21 @@ typedef enum
 	UNLOGGED_BUILD_PHASE_1,
 	UNLOGGED_BUILD_PHASE_2,
 	UNLOGGED_BUILD_NOT_PERMANENT
-} UnloggedBuildPhase;
+}			UnloggedBuildPhase;

 static SMgrRelation unlogged_build_rel = NULL;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 /*
 * Prefetch implementation:
- *
+ * 
 * Prefetch is performed locally by each backend.
 *
 * There can be up to readahead_buffer_size active IO requests registered at
 * any time. Requests using smgr_prefetch are sent to the pageserver, but we
 * don't wait on the response. Requests using smgr_read are either read from
 * the buffer, or (if that's not possible) we wait on the response to arrive -
- * this also will allow us to receive other prefetched pages.
+ * this also will allow us to receive other prefetched pages. 
 * Each request is immediately written to the output buffer of the pageserver
 * connection, but may not be flushed if smgr_prefetch is used: pageserver
 * flushes sent requests on manual flush, or every neon.flush_output_after
@@ -138,7 +138,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 /*
 * State machine:
- *
+ *        
 * not in hash : in hash
 *             :
 * UNUSED ------> REQUESTED --> RECEIVED
@@ -149,34 +149,30 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 *   +----------------+------------+
 *             :
 */
-typedef enum PrefetchStatus
-{
-	PRFS_UNUSED = 0,			/* unused slot */
-	PRFS_REQUESTED,				/* request was written to the sendbuffer to
-								 * PS, but not necessarily flushed. all fields
-								 * except response valid */
-	PRFS_RECEIVED,				/* all fields valid */
-	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
-								 * valid */
+typedef enum PrefetchStatus {
+	PRFS_UNUSED = 0,	/* unused slot */
+	PRFS_REQUESTED,		/* request was written to the sendbuffer to PS, but not
+						 * necessarily flushed.
+						 * all fields except response valid */
+	PRFS_RECEIVED,		/* all fields valid */
+	PRFS_TAG_REMAINS,	/* only buftag and my_ring_index are still valid */
 } PrefetchStatus;

-typedef struct PrefetchRequest
-{
-	BufferTag	buftag;			/* must be first entry in the struct */
+typedef struct PrefetchRequest {
+	BufferTag	buftag; /* must be first entry in the struct */
 	XLogRecPtr	effective_request_lsn;
 	XLogRecPtr	actual_request_lsn;
-	NeonResponse *response;		/* may be null */
+	NeonResponse *response; /* may be null */
 	PrefetchStatus status;
 	uint64		my_ring_index;
 } PrefetchRequest;

 /* prefetch buffer lookup hash table */

-typedef struct PrfHashEntry
-{
+typedef struct PrfHashEntry {
 	PrefetchRequest *slot;
-	uint32		status;
-	uint32		hash;
+	uint32 status;
+	uint32 hash;
 } PrfHashEntry;

 #define SH_PREFIX			prfh
@@ -200,42 +196,36 @@ typedef struct PrfHashEntry
 /*
 * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
 * It maintains a (ring) buffer of in-flight requests and responses.
- *
+ * 
 * We maintain several indexes into the ring buffer:
 * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
- *
+ * 
 * ring_unused points to the first unused slot of the buffer
 * ring_receive is the next request that is to be received
 * ring_last is the oldest received entry in the buffer
- *
+ * 
 * Apart from being an entry in the ring buffer of prefetch requests, each
 * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
 */
-typedef struct PrefetchState
-{
-	MemoryContext bufctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext errctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext hashctx;		/* context for prf_buffer */
+typedef struct PrefetchState {
+	MemoryContext bufctx; /* context for prf_buffer[].response allocations */
+	MemoryContext errctx; /* context for prf_buffer[].response allocations */
+	MemoryContext hashctx; /* context for prf_buffer */

 	/* buffer indexes */
-	uint64		ring_unused;	/* first unused slot */
-	uint64		ring_flush;		/* next request to flush */
-	uint64		ring_receive;	/* next slot that is to receive a response */
-	uint64		ring_last;		/* min slot with a response value */
+	uint64	ring_unused;		/* first unused slot */
+	uint64	ring_flush;			/* next request to flush */
+	uint64	ring_receive;		/* next slot that is to receive a response */
+	uint64	ring_last;			/* min slot with a response value */

 	/* metrics / statistics  */
-	int			n_responses_buffered;	/* count of PS responses not yet in
-										 * buffers */
-	int			n_requests_inflight;	/* count of PS requests considered in
-										 * flight */
-	int			n_unused;		/* count of buffers < unused, > last, that are
-								 * also unused */
+	int		n_responses_buffered;	/* count of PS responses not yet in buffers */
+	int		n_requests_inflight;	/* count of PS requests considered in flight */
+	int		n_unused;				/* count of buffers < unused, > last, that are also unused */

 	/* the buffers */
-	prfh_hash  *prf_hash;
-	PrefetchRequest prf_buffer[];	/* prefetch buffers */
+	prfh_hash *prf_hash;
+	PrefetchRequest prf_buffer[]; /* prefetch buffers */
 } PrefetchState;

 PrefetchState *MyPState;
@@ -273,10 +263,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 static bool
 compact_prefetch_buffers(void)
 {
-	uint64		empty_ring_index = MyPState->ring_last;
-	uint64		search_ring_index = MyPState->ring_receive;
-	int			n_moved = 0;
-
+	uint64	empty_ring_index = MyPState->ring_last;
+	uint64	search_ring_index = MyPState->ring_receive;
+	int n_moved = 0;
+	
 	if (MyPState->ring_receive == MyPState->ring_last)
 		return false;

@@ -291,14 +281,15 @@ compact_prefetch_buffers(void)
 	}

 	/*
-	 * Here we have established: slots < search_ring_index have an unknown
-	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
-	 * are unused slots > empty_ring_index are in use, or outside our buffer's
-	 * range. ... unless search_ring_index <= ring_last
-	 *
+	 * Here we have established:
+	 *   slots < search_ring_index have an unknown state (not scanned)
+	 *   slots >= search_ring_index and <= empty_ring_index are unused
+	 *   slots > empty_ring_index are in use, or outside our buffer's range.
+	 * ... unless search_ring_index <= ring_last
+	 * 
 	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as
-	 * we hit more unused items while moving backwards through the array.
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as we hit
+	 * more unused items while moving backwards through the array.
 	 */

 	while (search_ring_index > MyPState->ring_last)
@@ -338,10 +329,7 @@ compact_prefetch_buffers(void)

 		/* empty the moved slot */
 		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag)
-		{
-			0
-		};
+		source_slot->buftag = (BufferTag) {0};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
 		source_slot->effective_request_lsn = 0;
@@ -351,8 +339,8 @@ compact_prefetch_buffers(void)
 	}

 	/*
-	 * Only when we've moved slots we can expect trailing unused slots, so
-	 * only then we clean up trailing unused slots.
+	 * Only when we've moved slots we can expect trailing unused slots,
+	 * so only then we clean up trailing unused slots.
 	 */
 	if (n_moved > 0)
 	{
@@ -369,10 +357,10 @@ readahead_buffer_resize(int newsize, void *extra)
 	uint64		end,
 				nfree = newsize;
 	PrefetchState *newPState;
-	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
-																	  sizeof(PrefetchRequest) * newsize
-		);
-
+	Size 		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
+		sizeof(PrefetchRequest) * newsize
+	);
+	
 	/* don't try to re-initialize if we haven't initialized yet */
 	if (MyPState == NULL)
 		return;
@@ -399,12 +387,12 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_receive = newsize;
 	newPState->ring_flush = newsize;

-	/*
+	/* 
 	 * Copy over the prefetches.
-	 *
+	 * 
 	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one
-	 * iteration on the dataset, and trivial compaction.
+	 * prefetches, but this has the benefit of only needing to do one iteration
+	 * on the dataset, and trivial compaction.
 	 */
 	for (end = MyPState->ring_unused - 1;
 		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
@@ -412,7 +400,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
 		PrefetchRequest *newslot;
-		bool		found;
+		bool	found;

 		if (slot->status == PRFS_UNUSED)
 			continue;
@@ -475,11 +463,10 @@ consume_prefetch_responses(void)
 static void
 prefetch_cleanup_trailing_unused(void)
 {
-	uint64		ring_index;
+	uint64	ring_index;
 	PrefetchRequest *slot;

-	while (MyPState->ring_last < MyPState->ring_receive)
-	{
+	while (MyPState->ring_last < MyPState->ring_receive) {
 		ring_index = MyPState->ring_last;
 		slot = GetPrfSlot(ring_index);

@@ -493,7 +480,7 @@ prefetch_cleanup_trailing_unused(void)
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
- *
+ * 
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */
@@ -525,7 +512,7 @@ prefetch_wait_for(uint64 ring_index)

 /*
 * Read the response of a prefetch request into its slot.
- *
+ * 
 * The caller is responsible for making sure that the request for this buffer
 * was flushed to the PageServer.
 *
@@ -565,7 +552,7 @@ prefetch_read(PrefetchRequest *slot)

 /*
 * Disconnect hook - drop prefetches when the connection drops
- *
+ * 
 * If we don't remove the failed prefetches, we'd be serving incorrect
 * data to the smgr.
 */
@@ -576,7 +563,7 @@ prefetch_on_ps_disconnect(void)
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
-		uint64		ring_index = MyPState->ring_receive;
+		uint64 ring_index = MyPState->ring_receive;

 		slot = GetPrfSlot(ring_index);

@@ -606,7 +593,7 @@ prefetch_set_unused(uint64 ring_index)
 	PrefetchRequest *slot = GetPrfSlot(ring_index);

 	if (ring_index < MyPState->ring_last)
-		return;					/* Should already be unused */
+		return; /* Should already be unused */

 	Assert(MyPState->ring_unused > ring_index);

@@ -637,11 +624,7 @@ prefetch_set_unused(uint64 ring_index)
 	/* run cleanup if we're holding back ring_last */
 	if (MyPState->ring_last == ring_index)
 		prefetch_cleanup_trailing_unused();
-
-	/*
-	 * ... and try to store the buffered responses more compactly if > 12.5%
-	 * of the buffer is gaps
-	 */
+	/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
 	else if (ReceiveBufferNeedsCompaction())
 		compact_prefetch_buffers();
 }
@@ -649,7 +632,7 @@ prefetch_set_unused(uint64 ring_index)
 static void
 prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	bool		found;
+	bool found;
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
@@ -667,22 +650,21 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	}
 	else
 	{
-		XLogRecPtr	lsn = neon_get_request_lsn(
-											   &request.req.latest,
-											   BufTagGetNRelFileInfo(slot->buftag),
-											   slot->buftag.forkNum,
-											   slot->buftag.blockNum
-			);
-
+		XLogRecPtr lsn = neon_get_request_lsn(
+			&request.req.latest,
+			BufTagGetNRelFileInfo(slot->buftag),
+			slot->buftag.forkNum,
+			slot->buftag.blockNum
+		);
 		/*
-		 * Note: effective_request_lsn is potentially higher than the
-		 * requested LSN, but still correct:
-		 *
+		 * Note: effective_request_lsn is potentially higher than the requested
+		 * LSN, but still correct:
+		 * 
 		 * We know there are no changes between the actual requested LSN and
 		 * the value of effective_request_lsn: If there were, the page would
-		 * have been in cache and evicted between those LSN values, which then
-		 * would have had to result in a larger request LSN for this page.
-		 *
+		 * have been in cache and evicted between those LSN values, which
+		 * then would have had to result in a larger request LSN for this page.
+		 * 
 		 * It is possible that a concurrent backend loads the page, modifies
 		 * it and then evicts it again, but the LSN of that eviction cannot be
 		 * smaller than the current WAL insert/redo pointer, which is already
@@ -719,7 +701,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 * prefetch_register_buffer() - register and prefetch buffer
 *
 * Register that we may want the contents of BufferTag in the near future.
- *
+ * 
 * If force_latest and force_lsn are not NULL, those values are sent to the
 * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
 * to fill in these values manually.
@@ -731,14 +713,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 static uint64
 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
 {
-	uint64		ring_index;
+	uint64	ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;

 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-Retry:
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 	if (entry != NULL)
@@ -758,10 +740,7 @@ Retry:
 		 */
 		if (force_latest && force_lsn)
 		{
-			/*
-			 * if we want the latest version, any effective_request_lsn <
-			 * request lsn is OK
-			 */
+			/* if we want the latest version, any effective_request_lsn < request lsn is OK */
 			if (*force_latest)
 			{
 				if (*force_lsn > slot->effective_request_lsn)
@@ -772,11 +751,7 @@ Retry:
 				}

 			}
-
-			/*
-			 * if we don't want the latest version, only accept requests with
-			 * the exact same LSN
-			 */
+			/* if we don't want the latest version, only accept requests with the exact same LSN */
 			else
 			{
 				if (*force_lsn != slot->effective_request_lsn)
@@ -823,8 +798,7 @@ Retry:
 	 */
 	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
 	{
-		uint64		cleanup_index = MyPState->ring_last;
-
+		uint64 cleanup_index = MyPState->ring_last;
 		slot = GetPrfSlot(cleanup_index);

 		Assert(slot->status != PRFS_UNUSED);
@@ -839,10 +813,7 @@ Retry:
 		}
 		else
 		{
-			/*
-			 * We have the slot for ring_last, so that must still be in
-			 * progress
-			 */
+			/* We have the slot for ring_last, so that must still be in progress */
 			switch (slot->status)
 			{
 				case PRFS_REQUESTED:
@@ -861,8 +832,8 @@ Retry:
 	}

 	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-	 * we can insert the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty,
+	 * so we can insert the new request to it.
 	 */
 	ring_index = MyPState->ring_unused;
 	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
@@ -888,10 +859,7 @@ Retry:
 	{
 		if (!page_server->flush())
 		{
-			/*
-			 * Prefetch set is reset in case of error, so we should try to
-			 * register our request once again
-			 */
+			/* Prefetch set is reset in case of error, so we should try to register our request once again */
 			goto Retry;
 		}
 		MyPState->ring_flush = MyPState->ring_unused;
@@ -903,10 +871,8 @@ Retry:
 static NeonResponse *
 page_server_request(void const *req)
 {
-	NeonResponse *resp;
-
-	do
-	{
+	NeonResponse* resp;
+	do {
 		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
@@ -918,7 +884,7 @@ page_server_request(void const *req)


 StringInfoData
-nm_pack_request(NeonRequest *msg)
+nm_pack_request(NeonRequest * msg)
 {
 	StringInfoData s;

@@ -1034,7 +1000,7 @@ nm_unpack_response(StringInfo s)
 				/* XXX:	should be varlena */
 				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
 				pq_getmsgend(s);
-
+				
 				Assert(msg_resp->tag == T_NeonGetPageResponse);

 				resp = (NeonResponse *) msg_resp;
@@ -1090,7 +1056,7 @@ nm_unpack_response(StringInfo s)

 /* dump to json for debugging / error reporting purposes */
 char *
-nm_to_string(NeonMessage *msg)
+nm_to_string(NeonMessage * msg)
 {
 	StringInfoData s;

@@ -1219,7 +1185,7 @@ nm_to_string(NeonMessage *msg)
 * directly because it skips the logging if the LSN is new enough.
 */
 static XLogRecPtr
-log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
+log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 Page page, bool page_std)
 {
 	PGAlignedBlock copied_buffer;
@@ -1242,11 +1208,11 @@ PageIsEmptyHeapPage(char *buffer)
 }

 static void
-			neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-							 char *buffer, bool force)
+				 char *buffer, bool force)
 #else
-							 const char *buffer, bool force)
+				 const char *buffer, bool force) 
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
@@ -1346,24 +1312,24 @@ static void
 void
 neon_init(void)
 {
-	Size		prfs_size;
+	Size prfs_size;

 	if (MyPState != NULL)
 		return;

 	prfs_size = offsetof(PrefetchState, prf_buffer) + (
-													   sizeof(PrefetchRequest) * readahead_buffer_size
-		);
+		sizeof(PrefetchRequest) * readahead_buffer_size
+	);

 	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
-
+	
 	MyPState->n_unused = readahead_buffer_size;

 	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
 										 "NeonSMGR/prefetch",
 										 SLAB_DEFAULT_BLOCK_SIZE * 17,
 										 PS_GETPAGERESPONSE_SIZE);
-	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext, 
 											 "NeonSMGR/errors",
 											 ALLOCSET_DEFAULT_SIZES);
 	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1603,14 +1569,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	/*
 	 * Newly created relation is empty, remember that in the relsize cache.
 	 *
-	 * Note that in REDO, this is called to make sure the relation fork
-	 * exists, but it does not truncate the relation. So, we can only update
-	 * the relsize if it didn't exist before.
-	 *
+	 * Note that in REDO, this is called to make sure the relation fork exists,
+	 * but it does not truncate the relation. So, we can only update the
+	 * relsize if it didn't exist before.
+	 * 
 	 * Also, in redo, we must make sure to update the cached size of the
-	 * relation, as that is the primary source of truth for REDO's file length
-	 * considerations, and as file extension isn't (perfectly) logged, we need
-	 * to take care of that before we hit file size checks.
+	 * relation, as that is the primary source of truth for REDO's
+	 * file length considerations, and as file extension isn't (perfectly)
+	 * logged, we need to take care of that before we hit file size checks.
 	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1686,7 +1652,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #endif
 {
 	XLogRecPtr	lsn;
-	BlockNumber n_blocks = 0;
+	BlockNumber	n_blocks = 0;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1727,10 +1693,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/*
-	 * Usually Postgres doesn't extend relation on more than one page (leaving
-	 * holes). But this rule is violated in PG-15 where
-	 * CreateAndCopyRelationData call smgrextend for destination relation n
-	 * using size of source relation
+	 * Usually Postgres doesn't extend a relation by more than one page
+	 * (leaving holes). But this rule is violated in PG-15, where CreateAndCopyRelationData
+	 * calls smgrextend for the destination relation using the size of the source relation.
 	 */
 	n_blocks = neon_nblocks(reln, forkNum);
 	while (n_blocks < blkno)
@@ -1751,13 +1716,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif
-
 	/*
-	 * smgr_extend is often called with an all-zeroes page, so
-	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
-	 * later, after it has been initialized with the real page contents, and
-	 * it is eventually evicted from the buffer cache. But we need a valid LSN
-	 * to the relation metadata update now.
+	 * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
+	 * An smgr_write() call will come for the buffer later, after it has been initialized
+	 * with the real page contents, and it is eventually evicted from the buffer cache.
+	 * But we need a valid LSN to the relation metadata update now.
 	 */
 	if (lsn == InvalidXLogRecPtr)
 	{
@@ -1816,9 +1779,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
 		ereport(ERROR,
 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forkNum),
-						InvalidBlockNumber)));
+					errmsg("cannot extend file \"%s\" beyond %u blocks",
+						   relpath(reln->smgr_rlocator, forkNum),
+						   InvalidBlockNumber)));

 	/* Don't log any pages if we're not allowed to do so. */
 	if (!XLogInsertAllowed())
@@ -1905,7 +1868,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

 	switch (reln->smgr_relpersistence)
 	{
-		case 0:					/* probably shouldn't happen, but ignore it */
+		case 0: /* probably shouldn't happen, but ignore it */
 		case RELPERSISTENCE_PERMANENT:
 			break;

@@ -1920,10 +1883,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
 		return false;

-	tag = (BufferTag)
-	{
+	tag = (BufferTag) {
 		.forkNum = forknum,
-			.blockNum = blocknum
+		.blockNum = blocknum
 	};
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

@@ -1978,11 +1940,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 * To avoid breaking tests in the runtime please keep function signature in sync.
 */
 #if PG_MAJORVERSION_NUM < 16
-void		PGDLLEXPORT
+void PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
 #else
-void		PGDLLEXPORT
+void PGDLLEXPORT
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
 #endif
@@ -1993,21 +1955,21 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	PrfHashEntry *entry;
 	PrefetchRequest *slot;

-	buftag = (BufferTag)
-	{
+	buftag = (BufferTag) {
 		.forkNum = forkNum,
-			.blockNum = blkno,
+		.blockNum = blkno,
 	};

 	CopyNRelFileInfoToBufTag(buftag, rinfo);

 	/*
 	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the page
-	 * after redo has decided it won't redo that page and updated the LwLSN
-	 * for that page. If we're in hot standby we need to take care that we
-	 * don't return until after REDO has finished replaying up to that LwLSN,
-	 * as the page should have been locked up to that point.
+	 * not in the shared buffers, so a concurrent process may request the
+	 * page after redo has decided it won't redo that page and updated the
+	 * LwLSN for that page.
+	 * If we're in hot standby we need to take care that we don't return
+	 * until after REDO has finished replaying up to that LwLSN, as the page
+	 * should have been locked up to that point.
 	 *
 	 * See also the description on neon_redo_read_buffer_filter below.
 	 *
@@ -2015,7 +1977,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * concurrent failed read IOs. Those IOs should never have a request_lsn
 	 * that is as large as the WAL record we're currently replaying, if it
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
-	 * value of the LwLsn cache when the entry is not found.
+	 * value of the LwLsn cache when the entry is not found. 
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
 		XLogWaitForReplayOf(request_lsn);
@@ -2033,14 +1995,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
 		}
-		else					/* the current prefetch LSN is not large
-								 * enough, so drop the prefetch */
+		else /* the current prefetch LSN is not large enough, so drop the prefetch */
 		{
 			/*
 			 * We can't drop cache for not-yet-received requested items. It is
-			 * unlikely this happens, but it can happen if prefetch distance
-			 * is large enough and a backend didn't consume all prefetch
-			 * requests.
+			 * unlikely this happens, but it can happen if prefetch distance is
+			 * large enough and a backend didn't consume all prefetch requests.
 			 */
 			if (slot->status == PRFS_REQUESTED)
 			{
@@ -2067,11 +2027,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		else
 		{
 			/*
-			 * Empty our reference to the prefetch buffer's hash entry. When
-			 * we wait for prefetches, the entry reference is invalidated by
-			 * potential updates to the hash, and when we reconnect to the
-			 * pageserver the prefetch we're waiting for may be dropped, in
-			 * which case we need to retry and take the branch above.
+			 * Empty our reference to the prefetch buffer's hash entry.
+			 * When we wait for prefetches, the entry reference is invalidated by 
+			 * potential updates to the hash, and when we reconnect to the 
+			 * pageserver the prefetch we're waiting for may be dropped,
+			 * in which case we need to retry and take the branch above.
 			 */
 			entry = NULL;
 		}
@@ -2119,11 +2079,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 *	neon_read() -- Read the specified block from a relation.
 */
 void
-			neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #if PG_MAJORVERSION_NUM < 16
-					  char *buffer)
+		  char *buffer)
 #else
-					  void *buffer)
+		  void *buffer)
 #endif
 {
 	bool		latest;
@@ -2258,11 +2218,11 @@ hexdump_page(char *page)
 *		use mdextend().
 */
 void
-			neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 #if PG_MAJORVERSION_NUM < 16
-					   char *buffer, bool skipFsync)
+		   char *buffer, bool skipFsync)
 #else
-					   const void *buffer, bool skipFsync)
+		   const void *buffer, bool skipFsync)
 #endif
 {
 	XLogRecPtr	lsn;
@@ -2764,7 +2724,7 @@ smgr_init_neon(void)

 /*
 * Return whether we can skip the redo for this block.
- *
+ * 
 * The conditions for skipping the IO are:
 *
 * - The block is not in the shared buffers, and
@@ -2803,7 +2763,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	XLogRecPtr	end_recptr = record->EndRecPtr;
 	NRelFileInfo rinfo;
 	ForkNumber	forknum;
-	BlockNumber blkno;
+	BlockNumber	blkno;
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
@@ -2823,8 +2783,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	/*
 	 * Out of an abundance of caution, we always run redo on shared catalogs,
-	 * regardless of whether the block is stored in shared buffers. See also
-	 * this function's top comment.
+	 * regardless of whether the block is stored in shared buffers.
+	 * See also this function's top comment.
 	 */
 	if (!OidIsValid(NInfoGetDbOid(rinfo)))
 		return false;
@@ -2850,9 +2810,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	/* In both cases st lwlsn past this WAL record */
 	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

-	/*
-	 * we don't have the buffer in memory, update lwLsn past this record, also
-	 * evict page fro file cache
+	/* we don't have the buffer in memory, update lwLsn past this record,
+	 * also evict page from file cache
 	 */
 	if (no_redo_needed)
 		lfc_evict(rinfo, forknum, blkno);
@@ -2872,11 +2831,11 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/*
-		 * Size was not cached. We populate the cache now, with the size of
-		 * the relation measured after this WAL record is applied.
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
 		 *
-		 * This length is later reused when we open the smgr to read the
-		 * block, which is fine and expected.
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
 		 */

 		NeonResponse *response;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -43,6 +43,7 @@

 /* Prototypes for private functions */
 static void WalProposerLoop(WalProposer *wp);
+static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
 static void ShutdownConnection(Safekeeper *sk);
 static void ResetConnection(Safekeeper *sk);
 static long TimeToReconnect(WalProposer *wp, TimestampTz now);
@@ -75,9 +76,10 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper
 static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
 static bool AsyncFlush(Safekeeper *sk);
 static int	CompareLsn(const void *a, const void *b);
-static char *FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state);
+static char *FormatSafekeeperState(SafekeeperState state);
 static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
-static char *FormatEvents(uint32 events);
+static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
+static char *FormatEvents(WalProposer *wp, uint32 events);

 WalProposer *
 WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -123,7 +125,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 		}

 		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
-		wp->safekeeper[wp->n_safekeepers].xlogreader = NULL;
+		wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
+		wp->safekeeper[wp->n_safekeepers].flushWrite = false;
 		wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
 		wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
 		wp->n_safekeepers += 1;
@@ -175,7 +178,7 @@ WalProposerFree(WalProposer *wp)
 	if (wp->propTermHistory.entries != NULL)
 		pfree(wp->propTermHistory.entries);
 	wp->propTermHistory.entries = NULL;
-
+	
 	pfree(wp);
 }

@@ -272,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
 											   wp->config->safekeeper_connection_timeout))
 				{
 					walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
-								sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state), wp->config->safekeeper_connection_timeout);
+						 sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
 					ShutdownConnection(sk);
 				}
 			}
@@ -300,6 +303,43 @@ WalProposerLoop(WalProposer *wp)
 		WalProposerPoll(wp);
 }

+/*
+ * Hack: removes the event registered for a single safekeeper (to_remove) from the walproposer's event set.
+ *
+ * Note: internally this frees and rebuilds the whole event set, re-adding every other non-offline safekeeper. Avoid it if possible.
+ */
+static void
+HackyRemoveWalProposerEvent(Safekeeper *to_remove)
+{
+	WalProposer *wp = to_remove->wp;
+
+	/* Remove the existing event set, assign sk->eventPos = -1 */
+	wp->api.free_event_set(wp);
+	/* Re-initialize it without adding any safekeeper events */
+	wp->api.init_event_set(wp);
+
+	/*
+	 * Loop through all safekeepers. Skip the one being removed; every
+	 * other safekeeper that is not SS_OFFLINE is re-registered with the
+	 * events its current state wants.
+	 */
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		uint32		desired_events = WL_NO_EVENTS;
+		Safekeeper *sk = &wp->safekeeper[i];
+
+		if (sk == to_remove)
+			continue;
+
+		/* Offline safekeepers get no event; all others are re-added. */
+		if (sk->state != SS_OFFLINE)
+		{
+			desired_events = SafekeeperStateDesiredEvents(sk->state);
+			/* will set sk->eventPos */
+			wp->api.add_safekeeper_event_set(sk, desired_events);
+		}
+	}
+}

 /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
 static void
@@ -307,13 +347,14 @@ ShutdownConnection(Safekeeper *sk)
 {
 	sk->wp->api.conn_finish(sk);
 	sk->state = SS_OFFLINE;
+	sk->flushWrite = false;
 	sk->streamingAt = InvalidXLogRecPtr;

 	if (sk->voteResponse.termHistory.entries)
 		pfree(sk->voteResponse.termHistory.entries);
 	sk->voteResponse.termHistory.entries = NULL;

-	sk->wp->api.rm_safekeeper_event_set(sk);
+	HackyRemoveWalProposerEvent(sk);
 }

 /*
@@ -354,7 +395,7 @@ ResetConnection(Safekeeper *sk)
 		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 		 */
 		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+			 sk->host, sk->port, wp->api.conn_error_message(sk));

 		/*
 		 * Even though the connection failed, we still need to clean up the
@@ -431,6 +472,8 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
+	WalProposer *wp = sk->wp;
+
 	/*
 	 * Sanity check. We assume further down that the operations don't block
 	 * because the socket is ready.
@@ -446,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_OFFLINE:
 			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
-						sk->host, sk->port);
+				 sk->host, sk->port);
 			break;				/* actually unreachable, but prevents
 								 * -Wimplicit-fallthrough */

@@ -482,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_VOTING:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state, sk->active_state));
+				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -511,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
 			 */
 		case SS_IDLE:
 			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
-						sk->port, FormatSafekeeperState(sk->state, sk->active_state));
+				 sk->port, FormatSafekeeperState(sk->state));
 			ResetConnection(sk);
 			return;

@@ -537,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	{
 		case WP_CONN_POLLING_OK:
 			walprop_log(LOG, "connected with node %s:%s", sk->host,
-						sk->port);
+				 sk->port);
 			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 			/*
@@ -561,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)

 		case WP_CONN_POLLING_FAILED:
 			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+				 sk->host, sk->port, wp->api.conn_error_message(sk));

 			/*
 			 * If connecting failed, we don't want to restart the connection
@@ -577,7 +620,7 @@ HandleConnectionEvent(Safekeeper *sk)
 	 * Because PQconnectPoll can change the socket, we have to un-register the
 	 * old event and re-register an event on the new socket.
 	 */
-	wp->api.rm_safekeeper_event_set(sk);
+	HackyRemoveWalProposerEvent(sk);
 	wp->api.add_safekeeper_event_set(sk, new_events);

 	/* If we successfully connected, send START_WAL_PUSH query */
@@ -598,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
 	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 	{
 		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-					sk->host, sk->port, wp->api.conn_error_message(sk));
+			 sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -635,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)

 		case WP_EXEC_FAILED:
 			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
-						sk->host, sk->port, wp->api.conn_error_message(sk));
+				 sk->host, sk->port, wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return;

@@ -646,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
 			 */
 		case WP_EXEC_UNEXPECTED_SUCCESS:
 			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
-						sk->host, sk->port);
+				 sk->host, sk->port);
 			ShutdownConnection(sk);
 			return;
 	}
@@ -715,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	{
 		/* Another compute with higher term is running. */
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->greetResponse.term, wp->propTerm);
+			 sk->host, sk->port,
+			 sk->greetResponse.term, wp->propTerm);
 	}

 	/*
@@ -774,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	walprop_log(LOG,
-				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
-				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
+		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
+		 LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
+		 LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -789,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
 		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 	{
 		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
-					sk->host, sk->port,
-					sk->voteResponse.term, wp->propTerm);
+			 sk->host, sk->port,
+			 sk->voteResponse.term, wp->propTerm);
 	}
 	Assert(sk->voteResponse.term == wp->propTerm);

@@ -834,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
 	if (wp->truncateLsn < wp->propEpochStartLsn)
 	{
 		walprop_log(LOG,
-					"start recovery because truncateLsn=%X/%X is not "
-					"equal to epochStartLsn=%X/%X",
-					LSN_FORMAT_ARGS(wp->truncateLsn),
-					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+			 "start recovery because truncateLsn=%X/%X is not "
+			 "equal to epochStartLsn=%X/%X",
+			 LSN_FORMAT_ARGS(wp->truncateLsn),
+			 LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 		/* Perform recovery */
 		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
 			walprop_log(FATAL, "Failed to recover state");
@@ -947,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
 					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 				{
 					walprop_log(WARNING,
-								"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-								LSN_FORMAT_ARGS(wp->timelineStartLsn),
-								LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
+						 "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
+						 LSN_FORMAT_ARGS(wp->timelineStartLsn),
+						 LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 				}
 				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 			}
@@ -995,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

 	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-				wp->quorum,
-				wp->propTerm,
-				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
-				LSN_FORMAT_ARGS(wp->truncateLsn));
+		 wp->quorum,
+		 wp->propTerm,
+		 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+		 wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		 LSN_FORMAT_ARGS(wp->truncateLsn));

 	/*
 	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1027,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
 											walprop_shared->mineLastElectedTerm)))
 			{
 				walprop_log(PANIC,
-							"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
-							LSN_FORMAT_ARGS(wp->propEpochStartLsn),
-							LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
+					 "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
+					 LSN_FORMAT_ARGS(wp->propEpochStartLsn),
+					 LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 			}
 		}
 		walprop_shared->mineLastElectedTerm = wp->propTerm;
 	}

 	/*
-	 * WalProposer has just elected itself and initialized history, so we can
-	 * call election callback. Usually it updates truncateLsn to fetch WAL for
-	 * logical replication.
+	 * WalProposer has just elected itself and initialized history, so
+	 * we can call election callback. Usually it updates truncateLsn to
+	 * fetch WAL for logical replication.
 	 */
 	wp->api.after_election(wp);
 }
@@ -1061,10 +1104,6 @@ SendProposerElected(Safekeeper *sk)
 	term_t		lastCommonTerm;
 	int			i;

-	/* Now that we are ready to send it's a good moment to create WAL reader */
-	Assert(!sk->xlogreader);
-	wp->api.wal_reader_allocate(sk);
-
 	/*
 	 * Determine start LSN by comparing safekeeper's log term switch history
 	 * and proposer's, searching for the divergence point.
@@ -1116,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
 			sk->startStreamingAt = wp->truncateLsn;

 			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-						LSN_FORMAT_ARGS(sk->startStreamingAt));
+				 sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
+				 LSN_FORMAT_ARGS(sk->startStreamingAt));
 		}
 	}
 	else
@@ -1151,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)

 	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
 	walprop_log(LOG,
-				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
+		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
+		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

 	resetStringInfo(&sk->outbuf);
 	pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1184,7 +1223,6 @@ StartStreaming(Safekeeper *sk)
 	 * once for a connection.
 	 */
 	sk->state = SS_ACTIVE;
-	sk->active_state = SS_ACTIVE_SEND;
 	sk->streamingAt = sk->startStreamingAt;

 	/* event set will be updated inside SendMessageToNode */
@@ -1243,13 +1281,9 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 {
 	WalProposer *wp = sk->wp;

-	/*
-	 * Note: we don't known which socket awoke us (sk or nwr). However, as
-	 * SendAppendRequests always tries to send at least one msg in
-	 * SS_ACTIVE_SEND be careful not to go there if are only after sk
-	 * response, otherwise it'd create busy loop of pings.
-	 */
-	if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL)
+	uint32		newEvents = WL_SOCKET_READABLE;
+
+	if (events & WL_SOCKET_WRITEABLE)
 		if (!SendAppendRequests(sk))
 			return;

@@ -1257,26 +1291,28 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 		if (!RecvAppendResponses(sk))
 			return;

-	if (events & WL_SOCKET_CLOSED)
-	{
-		walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
-					sk->host, sk->port);
-		ShutdownConnection(sk);
-		return;
-	}
+	/*
+	 * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
+	 * in the buffer.
+	 *
+	 * LSN comparison checks if we have pending unsent messages. This check
+	 * isn't necessary now, because we always send append messages immediately
+	 * after arrival. But it's good to have it here in case we change this
+	 * behavior in the future.
+	 */
+	if (sk->streamingAt != wp->availableLsn || sk->flushWrite)
+		newEvents |= WL_SOCKET_WRITEABLE;

-	/* configures event set for yield whatever is the substate */
-	wp->api.active_state_update_event_set(sk);
+	wp->api.update_event_set(sk, newEvents);
 }

 /*
 * Send WAL messages starting from sk->streamingAt until the end or non-writable
- * socket or neon_walreader blocks, whichever comes first; active_state is
- * updated accordingly. Caller should take care of updating event set. Even if
- * no unsent WAL is available, at least one empty message will be sent as a
- * heartbeat, if socket is ready.
+ * socket, whichever comes first. Caller should take care of updating event set.
+ * Even if no unsent WAL is available, at least one empty message will be sent
+ * as a heartbeat, if socket is ready.
 *
- * Resets state and kills the connections if any error on them is encountered.
+ * Can change state if Async* functions encounter errors and reset connection.
 * Returns false in this case, true otherwise.
 */
 static bool
@@ -1284,11 +1320,11 @@ SendAppendRequests(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
 	XLogRecPtr	endLsn;
+	AppendRequestHeader *req;
 	PGAsyncWriteResult writeResult;
 	bool		sentAnything = false;
-	AppendRequestHeader *req;

-	if (sk->active_state == SS_ACTIVE_FLUSH)
+	if (sk->flushWrite)
 	{
 		if (!AsyncFlush(sk))

@@ -1299,99 +1335,76 @@ SendAppendRequests(Safekeeper *sk)
 			return sk->state == SS_ACTIVE;

 		/* Event set will be updated in the end of HandleActiveState */
-		sk->active_state = SS_ACTIVE_SEND;
+		sk->flushWrite = false;
 	}

 	while (sk->streamingAt != wp->availableLsn || !sentAnything)
 	{
-		if (sk->active_state == SS_ACTIVE_SEND)
+		sentAnything = true;
+
+		endLsn = sk->streamingAt;
+		endLsn += MAX_SEND_SIZE;
+
+		/* if we went beyond available WAL, back off */
+		if (endLsn > wp->availableLsn)
 		{
-			sentAnything = true;
+			endLsn = wp->availableLsn;
+		}

-			endLsn = sk->streamingAt;
-			endLsn += MAX_SEND_SIZE;
+		req = &sk->appendRequest;
+		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

-			/* if we went beyond available WAL, back off */
-			if (endLsn > wp->availableLsn)
-			{
-				endLsn = wp->availableLsn;
-			}
-
-			req = &sk->appendRequest;
-			PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
-
-			walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
 						req->endLsn - req->beginLsn,
 						LSN_FORMAT_ARGS(req->beginLsn),
 						LSN_FORMAT_ARGS(req->endLsn),
 						LSN_FORMAT_ARGS(req->commitLsn),
 						LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

-			resetStringInfo(&sk->outbuf);
+		resetStringInfo(&sk->outbuf);

-			/* write AppendRequest header */
-			appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
-			enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
-			sk->active_state = SS_ACTIVE_READ_WAL;
-		}
+		/* write AppendRequest header */
+		appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));

-		if (sk->active_state == SS_ACTIVE_READ_WAL)
+		/* write the WAL itself */
+		enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
+		/* wal_read will raise error on failure */
+		wp->api.wal_read(sk,
+						 &sk->outbuf.data[sk->outbuf.len],
+						 req->beginLsn,
+						 req->endLsn - req->beginLsn);
+		sk->outbuf.len += req->endLsn - req->beginLsn;
+
+		writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
+
+		/* Mark current message as sent, whatever the result is */
+		sk->streamingAt = endLsn;
+
+		switch (writeResult)
 		{
-			req = &sk->appendRequest;
+			case PG_ASYNC_WRITE_SUCCESS:
+				/* Continue writing the next message */
+				break;

-			switch (wp->api.wal_read(sk,
-									 &sk->outbuf.data[sk->outbuf.len],
-									 req->beginLsn,
-									 req->endLsn - req->beginLsn))
-			{
-				case NEON_WALREAD_SUCCESS:
-					break;
-				case NEON_WALREAD_WOULDBLOCK:
-					return true;
-				case NEON_WALREAD_ERROR:
-					walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
-								sk->host, sk->port,
-								NeonWALReaderErrMsg(sk->xlogreader));
-					ShutdownConnection(sk);
-					return false;
-				default:
-					Assert(false);
-			}
+			case PG_ASYNC_WRITE_TRY_FLUSH:

-			sk->outbuf.len += req->endLsn - req->beginLsn;
+				/*
+				 * We still need to call PQflush some more to finish the
+				 * job. Caller function will handle this by setting right
+				 * event set.
+				 */
+				sk->flushWrite = true;
+				return true;

-			writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
-
-			/* Mark current message as sent, whatever the result is */
-			sk->streamingAt = req->endLsn;
-
-			switch (writeResult)
-			{
-				case PG_ASYNC_WRITE_SUCCESS:
-					/* Continue writing the next message */
-					sk->active_state = SS_ACTIVE_SEND;
-					break;
-
-				case PG_ASYNC_WRITE_TRY_FLUSH:
-
-					/*
-					 * We still need to call PQflush some more to finish the
-					 * job. Caller function will handle this by setting right
-					 * event set.
-					 */
-					sk->active_state = SS_ACTIVE_FLUSH;
-					return true;
-
-				case PG_ASYNC_WRITE_FAIL:
-					walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
-								sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
-								wp->api.conn_error_message(sk));
-					ShutdownConnection(sk);
-					return false;
-				default:
-					Assert(false);
-					return false;
-			}
+			case PG_ASYNC_WRITE_FAIL:
+				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
+					 sk->host, sk->port, FormatSafekeeperState(sk->state),
+					 wp->api.conn_error_message(sk));
+				ShutdownConnection(sk);
+				return false;
+			default:
+				Assert(false);
+				return false;
 		}
 	}

@@ -1401,7 +1414,7 @@ SendAppendRequests(Safekeeper *sk)
 /*
 * Receive and process all available feedback.
 *
- * Resets state and kills the connection if any error on it is encountered.
+ * Can change state if Async* functions encounter errors and reset connection.
 * Returns false in this case, true otherwise.
 *
 * NB: This function can call SendMessageToNode and produce new messages.
@@ -1425,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
 			break;

 		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
-					sk->appendResponse.term,
-					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
-					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
-					sk->host, sk->port);
+						sk->appendResponse.term,
+						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
+						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
+						sk->host, sk->port);

 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/* Another compute with higher term is running. */
 			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
-						sk->host, sk->port,
-						sk->appendResponse.term, wp->propTerm);
+				 sk->host, sk->port,
+				 sk->appendResponse.term, wp->propTerm);
 		}

 		readAnything = true;
@@ -1480,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
-						rf->currentClusterSize);
+				 rf->currentClusterSize);
 		}
 		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
@@ -1488,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->last_received_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->last_received_lsn));
+				 LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
 		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
@@ -1496,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
+				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
@@ -1504,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 			/* read value length */
 			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
 			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
-						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
+				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
 		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
@@ -1517,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 				/* Copy because timestamptz_to_str returns a static buffer */
 				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
 				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
-							rf->replytime, replyTimeStr);
+					 rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
 			}
@@ -1582,53 +1595,6 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
 	return responses[wp->n_safekeepers - wp->quorum];
 }

-/*
- * Return safekeeper with active connection from which WAL can be downloaded, or
- * none if it doesn't exist. donor_lsn is set to end position of the donor to
- * the best of our knowledge.
- */
-Safekeeper *
-GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
-{
-	*donor_lsn = InvalidXLogRecPtr;
-	Safekeeper *donor = NULL;
-	int			i;
-
-	if (wp->n_votes < wp->quorum)
-	{
-		walprop_log(WARNING, "GetDonor called before elections are won");
-		return NULL;
-	}
-
-	/*
-	 * First, consider node which had determined our term start LSN as we know
-	 * about its position immediately after election before any feedbacks are
-	 * sent.
-	 */
-	if (wp->safekeeper[wp->donor].state >= SS_IDLE)
-	{
-		donor = &wp->safekeeper[wp->donor];
-		*donor_lsn = wp->propEpochStartLsn;
-	}
-
-	/*
-	 * But also check feedbacks from all nodes with live connections and take
-	 * the highest one. Note: if node sends feedbacks it already processed
-	 * elected message so its term is fine.
-	 */
-	for (i = 0; i < wp->n_safekeepers; i++)
-	{
-		Safekeeper *sk = &wp->safekeeper[i];
-
-		if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
-		{
-			donor = sk;
-			*donor_lsn = sk->appendResponse.flushLsn;
-		}
-	}
-	return donor;
-}
-
 static void
 HandleSafekeeperResponse(WalProposer *wp)
 {
@@ -1734,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)

 		case PG_ASYNC_READ_FAIL:
 			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
-						sk->port, FormatSafekeeperState(sk->state, sk->active_state),
-						wp->api.conn_error_message(sk));
+				 sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 	}
@@ -1774,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 	if (tag != anymsg->tag)
 	{
 		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-					sk->port, FormatSafekeeperState(sk->state, sk->active_state));
+			 sk->port, FormatSafekeeperState(sk->state));
 		ResetConnection(sk);
 		return false;
 	}
@@ -1845,14 +1811,13 @@ static bool
 BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state)
 {
 	WalProposer *wp = sk->wp;
-	uint32		sk_events;
-	uint32		nwr_events;
+	uint32		events;

 	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 	{
 		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-					sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
-					wp->api.conn_error_message(sk));
+			 sk->host, sk->port, FormatSafekeeperState(sk->state),
+			 wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return false;
 	}
@@ -1863,15 +1828,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
 	 * If the new state will be waiting for events to happen, update the event
 	 * set to wait for those
 	 */
-	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-	/*
-	 * nwr_events is relevant only during SS_ACTIVE which doesn't user
-	 * BlockingWrite
-	 */
-	Assert(!nwr_events);
-	if (sk_events)
-		wp->api.update_event_set(sk, sk_events);
+	events = SafekeeperStateDesiredEvents(success_state);
+	if (events)
+		wp->api.update_event_set(sk, events);

 	return true;
 }
@@ -1904,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
 			return false;
 		case PG_ASYNC_WRITE_FAIL:
 			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
-						wp->api.conn_error_message(sk));
+				 sk->host, sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ShutdownConnection(sk);
 			return false;
 		default:
@@ -1943,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
 			return false;
 		case -1:
 			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
-						sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state),
-						wp->api.conn_error_message(sk));
+				 sk->host, sk->port, FormatSafekeeperState(sk->state),
+				 wp->api.conn_error_message(sk));
 			ResetConnection(sk);
 			return false;
 		default:
@@ -1973,14 +1932,14 @@ CompareLsn(const void *a, const void *b)
 *
 * The strings are intended to be used as a prefix to "state", e.g.:
 *
- *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state, sk->active_state));
+ *   walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
 *
 * If this sort of phrasing doesn't fit the message, instead use something like:
 *
- *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state, sk->active_state));
+ *   walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
 */
 static char *
-FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
+FormatSafekeeperState(SafekeeperState state)
 {
 	char	   *return_val = NULL;

@@ -2012,18 +1971,7 @@ FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
 			return_val = "idle";
 			break;
 		case SS_ACTIVE:
-			switch (active_state)
-			{
-				case SS_ACTIVE_SEND:
-					return_val = "active send";
-					break;
-				case SS_ACTIVE_READ_WAL:
-					return_val = "active read WAL";
-					break;
-				case SS_ACTIVE_FLUSH:
-					return_val = "active flush";
-					break;
-			}
+			return_val = "active";
 			break;
 	}

@@ -2036,20 +1984,22 @@ FormatSafekeeperState(SafekeeperState state, SafekeeperActiveState active_state)
 static void
 AssertEventsOkForState(uint32 events, Safekeeper *sk)
 {
-	uint32		sk_events;
-	uint32		nwr_events;
-	uint32		expected;
+	WalProposer *wp = sk->wp;
+	uint32		expected = SafekeeperStateDesiredEvents(sk->state);
+
+	/*
+	 * The events are in-line with what we're expecting, under two conditions:
+	 * (a) if we aren't expecting anything, `events` has no read- or
+	 * write-ready component. (b) if we are expecting something, there's
+	 * overlap (i.e. `events & expected != 0`)
+	 */
 	bool		events_ok_for_state;	/* long name so the `Assert` is more
 										 * clear later */

-	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-	/*
-	 * Without one more level of notify target indirection we have no way to
-	 * distinguish which socket woke up us, so just union expected events.
-	 */
-	expected = sk_events | nwr_events;
-	events_ok_for_state = ((events & expected) != 0);
+	if (expected == WL_NO_EVENTS)
+		events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
+	else
+		events_ok_for_state = ((events & expected) != 0);

 	if (!events_ok_for_state)
 	{
@@ -2058,37 +2008,36 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
 		 * and then an assertion that's guaranteed to fail.
 		 */
 		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
-					FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state, sk->active_state));
+			 FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
 		Assert(events_ok_for_state);
 	}
 }

-/* Returns the set of events for both safekeeper (sk_events) and neon_walreader
- * (nwr_events) sockets a safekeeper in this state should be waiting on.
+/* Returns the set of events a safekeeper in this state should be waiting on
 *
 * This will return WL_NO_EVENTS (= 0) for some events. */
-void
-SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events)
+static uint32
+SafekeeperStateDesiredEvents(SafekeeperState state)
 {
-	*nwr_events = 0;			/* nwr_events is empty for most states */
+	uint32		result = WL_NO_EVENTS;

 	/* If the state doesn't have a modifier, we can check the base state */
-	switch (sk->state)
+	switch (state)
 	{
 			/* Connecting states say what they want in the name */
 		case SS_CONNECTING_READ:
-			*sk_events = WL_SOCKET_READABLE;
-			return;
+			result = WL_SOCKET_READABLE;
+			break;
 		case SS_CONNECTING_WRITE:
-			*sk_events = WL_SOCKET_WRITEABLE;
-			return;
+			result = WL_SOCKET_WRITEABLE;
+			break;

 			/* Reading states need the socket to be read-ready to continue */
 		case SS_WAIT_EXEC_RESULT:
 		case SS_HANDSHAKE_RECV:
 		case SS_WAIT_VERDICT:
-			*sk_events = WL_SOCKET_READABLE;
-			return;
+			result = WL_SOCKET_READABLE;
+			break;

 			/*
 			 * Idle states use read-readiness as a sign that the connection
@@ -2096,62 +2045,32 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even
 			 */
 		case SS_VOTING:
 		case SS_IDLE:
-			*sk_events = WL_SOCKET_READABLE;
-			return;
+			result = WL_SOCKET_READABLE;
+			break;

+			/*
+			 * Flush states require write-ready for flushing. Active state
+			 * does both reading and writing.
+			 *
+			 * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
+			 * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
+			 */
 		case SS_SEND_ELECTED_FLUSH:
-			*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
-			return;
-
 		case SS_ACTIVE:
-			switch (sk->active_state)
-			{
-					/*
-					 * Everything is sent; we just wait for sk responses and
-					 * latch.
-					 *
-					 * Note: this assumes we send all available WAL to
-					 * safekeeper in one wakeup (unless it blocks). Otherwise
-					 * we would want WL_SOCKET_WRITEABLE here to finish the
-					 * work.
-					 */
-				case SS_ACTIVE_SEND:
-					*sk_events = WL_SOCKET_READABLE;
-					if (NeonWALReaderEvents(sk->xlogreader))
-						*nwr_events = WL_SOCKET_CLOSED; /* c.f.
-														 * walprop_pg_active_state_update_event_set */
-					return;
-
-					/*
-					 * Waiting for neon_walreader socket, but we still read
-					 * responses from sk socket.
-					 */
-				case SS_ACTIVE_READ_WAL:
-					*sk_events = WL_SOCKET_READABLE;
-					*nwr_events = NeonWALReaderEvents(sk->xlogreader);
-					return;
-
-					/*
-					 * Need to flush the sk socket, so ignore neon_walreader
-					 * one and set write interest on sk.
-					 */
-				case SS_ACTIVE_FLUSH:
-					*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
-					if (NeonWALReaderEvents(sk->xlogreader))
-						*nwr_events = WL_SOCKET_CLOSED; /* c.f.
-														 * walprop_pg_active_state_update_event_set */
-					return;
-			}
-			return;
+			result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+			break;

 			/* The offline state expects no events. */
 		case SS_OFFLINE:
-			*sk_events = 0;
-			return;
+			result = WL_NO_EVENTS;
+			break;

 		default:
 			Assert(false);
+			break;
 	}
+
+	return result;
 }

 /* Returns a human-readable string corresponding to the event set
@@ -2162,7 +2081,7 @@ SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_even
 * The string should not be freed. It should also not be expected to remain the same between
 * function calls. */
 static char *
-FormatEvents(uint32 events)
+FormatEvents(WalProposer *wp, uint32 events)
 {
 	static char return_str[8];

@@ -2192,7 +2111,7 @@ FormatEvents(uint32 events)
 	if (events & (~all_flags))
 	{
 		walprop_log(WARNING, "Event formatting found unexpected component %d",
-					events & (~all_flags));
+			 events & (~all_flags));
 		return_str[6] = '*';
 		return_str[7] = '\0';
 	}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -10,9 +10,6 @@
 #include "utils/uuid.h"
 #include "replication/walreceiver.h"

-#include "libpqwalproposer.h"
-#include "neon_walreader.h"
-
 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2

@@ -25,9 +22,43 @@
 */
 #define WL_NO_EVENTS 0

-struct WalProposerConn;			/* Defined in libpqwalproposer.h */
+struct WalProposerConn;			/* Defined in implementation (walprop_pg.c) */
 typedef struct WalProposerConn WalProposerConn;

+/* Possible return values from ReadPGAsync */
+typedef enum
+{
+	/* The full read was successful. buf now points to the data */
+	PG_ASYNC_READ_SUCCESS,
+
+	/*
+	 * The read is ongoing. Wait until the connection is read-ready, then try
+	 * again.
+	 */
+	PG_ASYNC_READ_TRY_AGAIN,
+	/* Reading failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_READ_FAIL,
+} PGAsyncReadResult;
+
+/* Possible return values from WritePGAsync */
+typedef enum
+{
+	/* The write fully completed */
+	PG_ASYNC_WRITE_SUCCESS,
+
+	/*
+	 * The write started, but you'll need to call PQflush some more times to
+	 * finish it off. We just tried, so it's best to wait until the connection
+	 * is read- or write-ready to try again.
+	 *
+	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
+	 * becomes write-ready, just call PQflush.
+	 */
+	PG_ASYNC_WRITE_TRY_FLUSH,
+	/* Writing failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_WRITE_FAIL,
+} PGAsyncWriteResult;
+
 /*
 * WAL safekeeper state, which is used to wait for some event.
 *
@@ -104,40 +135,6 @@ typedef enum
 	SS_ACTIVE,
 } SafekeeperState;

-/*
- * Sending WAL substates of SS_ACTIVE.
- */
-typedef enum
-{
-	/*
-	 * We are ready to send more WAL, waiting for latch set to learn about
-	 * more WAL becoming available (or just a timeout to send heartbeat).
-	 */
-	SS_ACTIVE_SEND,
-
-	/*
-	 * Polling neon_walreader to receive chunk of WAL (probably remotely) to
-	 * send to this safekeeper.
-	 *
-	 * Note: socket management is done completely inside walproposer_pg for
-	 * simplicity, and thus simulation doesn't test it. Which is fine as
-	 * simulation is mainly aimed at consensus checks, not waiteventset
-	 * management.
-	 *
-	 * Also, while in this state we don't touch safekeeper socket, so in
-	 * theory it might close connection as inactive. This can be addressed if
-	 * needed; however, while fetching WAL we should regularly send it, so the
-	 * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
-	 * walreader socket), but similarly shouldn't be a problem.
-	 */
-	SS_ACTIVE_READ_WAL,
-
-	/*
-	 * Waiting for write readiness to flush the socket.
-	 */
-	SS_ACTIVE_FLUSH,
-} SafekeeperActiveState;
-
 /* Consensus logical timestamp. */
 typedef uint64 term_t;

@@ -346,11 +343,12 @@ typedef struct Safekeeper
 	 */
 	XLogRecPtr	startStreamingAt;

+	bool		flushWrite;		/* set to true if we need to call AsyncFlush
+								 * to flush pending messages */
 	XLogRecPtr	streamingAt;	/* current streaming position */
 	AppendRequestHeader appendRequest;	/* request for sending to safekeeper */

 	SafekeeperState state;		/* safekeeper state machine state */
-	SafekeeperActiveState active_state;
 	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
@@ -358,8 +356,7 @@ typedef struct Safekeeper


 	/* postgres-specific fields */
-#ifndef WALPROPOSER_LIB
-
+	#ifndef WALPROPOSER_LIB
 	/*
 	 * postgres protocol connection to the WAL acceptor
 	 *
@@ -371,29 +368,23 @@ typedef struct Safekeeper
 	/*
 	 * WAL reader, allocated for each safekeeper.
 	 */
-	NeonWALReader *xlogreader;
+	XLogReaderState *xlogreader;

 	/*
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
-
-	/*
-	 * Neon WAL reader position in wait event set, or -1 if no socket.
-	 */
-	int			nwrEventPos;
-#endif
+	#endif


 	/* WalProposer library specifics */
-#ifdef WALPROPOSER_LIB
-
+	#ifdef WALPROPOSER_LIB
 	/*
 	 * Buffer for incoming messages. Usually Rust vector is stored here.
 	 * Caller is responsible for freeing the buffer.
 	 */
 	StringInfoData inbuf;
-#endif
+	#endif
 } Safekeeper;

 /* Re-exported PostgresPollingStatusType */
@@ -410,6 +401,31 @@ typedef enum
 	 */
 } WalProposerConnectPollStatusType;

+/* Re-exported and modified ExecStatusType */
+typedef enum
+{
+	/* We received a single CopyBoth result */
+	WP_EXEC_SUCCESS_COPYBOTH,
+
+	/*
+	 * Any success result other than a single CopyBoth was received. The
+	 * specifics of the result were already logged, but it may be useful to
+	 * provide an error message indicating which safekeeper messed up.
+	 *
+	 * Do not expect PQerrorMessage to be appropriately set.
+	 */
+	WP_EXEC_UNEXPECTED_SUCCESS,
+
+	/*
+	 * No result available at this time. Wait until read-ready, then call
+	 * again. Internally, this is returned when PQisBusy indicates that
+	 * PQgetResult would block.
+	 */
+	WP_EXEC_NEEDS_INPUT,
+	/* Catch-all failure. Check PQerrorMessage. */
+	WP_EXEC_FAILED,
+} WalProposerExecStatusType;
+
 /* Re-exported ConnStatusType */
 typedef enum
 {
@@ -456,7 +472,7 @@ typedef struct walproposer_api
 	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);

 	/* Start the connection, aka PQconnectStart. */
-	void		(*conn_connect_start) (Safekeeper *sk);
+	void (*conn_connect_start) (Safekeeper *sk);

 	/* Poll an asynchronous connection, aka PQconnectPoll. */
 	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
@@ -474,7 +490,7 @@ typedef struct walproposer_api
 	void		(*conn_finish) (Safekeeper *sk);

 	/*
-	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
+	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData. 
 	 *
 	 * On success, the data is placed in *buf. It is valid until the next call
 	 * to this function.
@@ -491,10 +507,13 @@ typedef struct walproposer_api
 	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);

 	/* Read WAL from disk to buf. */
-	NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
+	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

 	/* Allocate WAL reader. */
-	void		(*wal_reader_allocate) (Safekeeper *sk);
+	void (*wal_reader_allocate) (Safekeeper *sk);
+
+	/* Deallocate event set. */
+	void		(*free_event_set) (WalProposer *wp);

 	/* Initialize event set. */
 	void		(*init_event_set) (WalProposer *wp);
@@ -502,15 +521,9 @@ typedef struct walproposer_api
 	/* Update events for an existing safekeeper connection. */
 	void		(*update_event_set) (Safekeeper *sk, uint32 events);

-	/* Configure wait event set for yield in SS_ACTIVE. */
-	void		(*active_state_update_event_set) (Safekeeper *sk);
-
 	/* Add a new safekeeper connection to the event set. */
 	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);

-	/* Remove safekeeper connection from event set */
-	void		(*rm_safekeeper_event_set) (Safekeeper *sk);
-
 	/*
 	 * Wait until some event happens: - timeout is reached - socket event for
 	 * safekeeper connection - new WAL is available
@@ -559,7 +572,7 @@ typedef struct walproposer_api
 	/*
 	 * Called right after the proposer was elected, but before it started
 	 * recovery and sent ProposerElected message to the safekeepers.
-	 *
+	 * 
 	 * Used by logical replication to update truncateLsn.
 	 */
 	void		(*after_election) (WalProposer *wp);
@@ -613,10 +626,10 @@ typedef struct WalProposerConfig
 	uint64		systemId;

 	/* Will be passed to safekeepers in greet request. */
-	TimeLineID	pgTimeline;
+	TimeLineID  pgTimeline;

 #ifdef WALPROPOSER_LIB
-	void	   *callback_data;
+	void *callback_data;
 #endif
 } WalProposerConfig;

@@ -696,19 +709,11 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
 extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);

-/*
- * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
- * recreate set from scratch, hence the export.
- */
-extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
-extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);

-
-#define WPEVENT		1337		/* special log level for walproposer internal
-								 * events */
+#define WPEVENT		1337	/* special log level for walproposer internal events */

 #ifdef WALPROPOSER_LIB
-void		WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
+void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
 #define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
 #define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -9,9 +9,8 @@
 #include "utils/datetime.h"
 #include "miscadmin.h"

-void
-ExceptionalCondition(const char *conditionName,
-					 const char *fileName, int lineNumber)
+void ExceptionalCondition(const char *conditionName,
+						  const char *fileName, int lineNumber)
 {
 	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
 			fileName, lineNumber, conditionName);
@@ -170,18 +169,17 @@ timestamptz_to_str(TimestampTz t)

 bool
 TimestampDifferenceExceeds(TimestampTz start_time,
-						   TimestampTz stop_time,
-						   int msec)
+								TimestampTz stop_time,
+								int msec)
 {
 	TimestampTz diff = stop_time - start_time;
-
 	return (diff >= msec * INT64CONST(1000));
 }

 void
-WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
 {
-	char		buf[1024];
+	char buf[1024];
 	va_list		args;

 	fmt = _(fmt);
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -43,12 +43,9 @@
 #include "utils/ps_status.h"
 #include "utils/timestamp.h"

-#include "libpq-fe.h"
-
-#include "libpqwalproposer.h"
 #include "neon.h"
-#include "neon_walreader.h"
 #include "walproposer.h"
+#include "libpq-fe.h"

 #define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
 #define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
@@ -94,10 +91,6 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
 static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
 static void XLogWalPropClose(XLogRecPtr recptr);

-static void add_nwr_event_set(Safekeeper *sk, uint32 events);
-static void update_nwr_event_set(Safekeeper *sk, uint32 events);
-static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
-
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -548,6 +541,14 @@ walprop_pg_load_libpqwalreceiver(void)
 		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
 }

+/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
+struct WalProposerConn
+{
+	PGconn	   *pg_conn;
+	bool		is_nonblocking; /* whether the connection is non-blocking */
+	char	   *recvbuf;		/* last received data from walprop_async_read */
+};
+
 /* Helper function */
 static bool
 ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -585,17 +586,16 @@ walprop_status(Safekeeper *sk)
 	}
 }

-WalProposerConn *
-libpqwp_connect_start(char *conninfo)
+static void
+walprop_connect_start(Safekeeper *sk)
 {
-
 	PGconn	   *pg_conn;
-	WalProposerConn *conn;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
 	char	   *password = neon_auth_token;

+	Assert(sk->conn == NULL);

 	/*
 	 * Connect using the given connection string. If the NEON_AUTH_TOKEN
@@ -614,7 +614,7 @@ libpqwp_connect_start(char *conninfo)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = conninfo;
+	values[n] = sk->conninfo;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -635,20 +635,11 @@ libpqwp_connect_start(char *conninfo)
 	 * palloc will exit on failure though, so there's not much we could do if
 	 * it *did* fail.
 	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
+	sk->conn = palloc(sizeof(WalProposerConn));
+	sk->conn->pg_conn = pg_conn;
+	sk->conn->is_nonblocking = false;	/* connections always start in blocking
 									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
-}
-
-static void
-walprop_connect_start(Safekeeper *sk)
-{
-	Assert(sk->conn == NULL);
-	sk->conn = libpqwp_connect_start(sk->conninfo);
-
+	sk->conn->recvbuf = NULL;
 }

 static WalProposerConnectPollStatusType
@@ -692,33 +683,26 @@ walprop_connect_poll(Safekeeper *sk)
 	return return_val;
 }

-extern bool
-libpqwp_send_query(WalProposerConn *conn, char *query)
+static bool
+walprop_send_query(Safekeeper *sk, char *query)
 {
 	/*
 	 * We need to be in blocking mode for sending the query to run without
 	 * requiring a call to PQflush
 	 */
-	if (!ensure_nonblocking_status(conn, false))
+	if (!ensure_nonblocking_status(sk->conn, false))
 		return false;

 	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
+	if (!PQsendQuery(sk->conn->pg_conn, query))
 		return false;

 	return true;
 }

-static bool
-walprop_send_query(Safekeeper *sk, char *query)
+static WalProposerExecStatusType
+walprop_get_query_result(Safekeeper *sk)
 {
-	return libpqwp_send_query(sk->conn, query);
-}
-
-WalProposerExecStatusType
-libpqwp_get_query_result(WalProposerConn *conn)
-{
-
 	PGresult   *result;
 	WalProposerExecStatusType return_val;

@@ -726,14 +710,14 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	char	   *unexpected_success = NULL;

 	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
+	if (!PQconsumeInput(sk->conn->pg_conn))
 		return WP_EXEC_FAILED;

-	if (PQisBusy(conn->pg_conn))
+	if (PQisBusy(sk->conn->pg_conn))
 		return WP_EXEC_NEEDS_INPUT;


-	result = PQgetResult(conn->pg_conn);
+	result = PQgetResult(sk->conn->pg_conn);

 	/*
 	 * PQgetResult returns NULL only if getting the result was successful &
@@ -794,12 +778,6 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	return return_val;
 }

-static WalProposerExecStatusType
-walprop_get_query_result(Safekeeper *sk)
-{
-	return libpqwp_get_query_result(sk->conn);
-}
-
 static pgsocket
 walprop_socket(Safekeeper *sk)
 {
@@ -812,21 +790,38 @@ walprop_flush(Safekeeper *sk)
 	return (PQflush(sk->conn->pg_conn));
 }

-/* Like libpqrcv_receive. *buf is valid until the next call. */
-PGAsyncReadResult
-libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
+static void
+walprop_finish(Safekeeper *sk)
 {
+	if (!sk->conn)
+		return;

+	if (sk->conn->recvbuf != NULL)
+		PQfreemem(sk->conn->recvbuf);
+	PQfinish(sk->conn->pg_conn);
+	pfree(sk->conn);
+	sk->conn = NULL;
+}
+
+/*
+ * Receive a message from the safekeeper.
+ *
+ * On success, the data is placed in *buf. It is valid until the next call
+ * to this function.
+ */
+static PGAsyncReadResult
+walprop_async_read(Safekeeper *sk, char **buf, int *amount)
+{
 	int			result;

-	if (conn->recvbuf != NULL)
+	if (sk->conn->recvbuf != NULL)
 	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
+		PQfreemem(sk->conn->recvbuf);
+		sk->conn->recvbuf = NULL;
 	}

 	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
+	if (!PQconsumeInput(sk->conn->pg_conn))
 	{
 		*amount = 0;
 		*buf = NULL;
@@ -844,7 +839,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 	 * sometimes be triggered by the server returning an ErrorResponse (which
 	 * also happens to have the effect that the copy is done).
 	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
+	switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
 	{
 		case 0:
 			*amount = 0;
@@ -859,7 +854,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 				 * We can check PQgetResult to make sure that the server
 				 * failed; it'll always result in PGRES_FATAL_ERROR
 				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
+				ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));

 				if (status != PGRES_FATAL_ERROR)
 					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
@@ -880,23 +875,11 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 		default:
 			/* Positive values indicate the size of the returned result */
 			*amount = result;
-			*buf = conn->recvbuf;
+			*buf = sk->conn->recvbuf;
 			return PG_ASYNC_READ_SUCCESS;
 	}
 }

-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-static PGAsyncReadResult
-walprop_async_read(Safekeeper *sk, char **buf, int *amount)
-{
-	return libpqwp_async_read(sk->conn, buf, amount);
-}
-
 static PGAsyncWriteResult
 walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 {
@@ -979,32 +962,6 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
 	return true;
 }

-void
-libpqwp_disconnect(WalProposerConn *conn)
-{
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
-}
-
-static void
-walprop_finish(Safekeeper *sk)
-{
-	if (sk->conn)
-	{
-		libpqwp_disconnect(sk->conn);
-		sk->conn = NULL;
-	}
-
-	/* free xlogreader */
-	if (sk->xlogreader)
-	{
-		NeonWALReaderFree(sk->xlogreader);
-		sk->xlogreader = NULL;
-	}
-}
-
 /*
 * Subscribe for new WAL and stream it in the loop to safekeepers.
 *
@@ -1429,41 +1386,26 @@ XLogWalPropClose(XLogRecPtr recptr)
 	walpropFile = -1;
 }

-static NeonWALReadResult
+static void
 walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
 {
-	NeonWALReadResult res;
+	WALReadError errinfo;

-	res = NeonWALRead(sk->xlogreader,
-					  buf,
-					  startptr,
-					  count,
-					  walprop_pg_get_timeline_id());
-
-	if (res == NEON_WALREAD_SUCCESS)
+	if (!WALRead(sk->xlogreader,
+				 buf,
+				 startptr,
+				 count,
+				 walprop_pg_get_timeline_id(),
+				 &errinfo))
 	{
-		/*
-		 * If we have the socket subscribed, but walreader doesn't need any
-		 * events, it must mean that remote connection just closed hoping to
-		 * do next read locally. Remove the socket then. It is important to do
-		 * as otherwise next read might open another connection and we won't
-		 * be able to distinguish whether we have correct socket added in wait
-		 * event set.
-		 */
-		if (NeonWALReaderEvents(sk->xlogreader) == 0)
-			rm_safekeeper_event_set(sk, false);
+		WALReadRaiseError(&errinfo);
 	}
-
-	return res;
 }

 static void
 walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
-	char		log_prefix[64];
-
-	snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
-	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
+	sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
 	if (sk->xlogreader == NULL)
 		elog(FATAL, "Failed to allocate xlog reader");
 }
@@ -1482,7 +1424,6 @@ walprop_pg_free_event_set(WalProposer *wp)
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
 		wp->safekeeper[i].eventPos = -1;
-		wp->safekeeper[i].nwrEventPos = -1;
 	}
 }

@@ -1492,35 +1433,11 @@ walprop_pg_init_event_set(WalProposer *wp)
 	if (waitEvents)
 		elog(FATAL, "double-initialization of event set");

-	/* for each sk, we have socket plus potentially socket for neon walreader */
-	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		wp->safekeeper[i].eventPos = -1;
-		wp->safekeeper[i].nwrEventPos = -1;
-	}
-}
-
-/* add safekeeper socket to wait event set */
-static void
-walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
-{
-	Assert(sk->eventPos == -1);
-	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
-}
-
-/* add neon wal reader socket to wait event set */
-static void
-add_nwr_event_set(Safekeeper *sk, uint32 events)
-{
-	Assert(sk->nwrEventPos == -1);
-	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
-	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }

 static void
@@ -1532,139 +1449,10 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
 	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
 }

-/*
- * Update neon_walreader event.
- * Can be called when nwr socket doesn't exist, does nothing in this case.
- */
 static void
-update_nwr_event_set(Safekeeper *sk, uint32 events)
+walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
 {
-	/* eventPos = -1 when we don't have an event */
-	if (sk->nwrEventPos != -1)
-		ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
-}
-
-
-static void
-walprop_pg_active_state_update_event_set(Safekeeper *sk)
-{
-	uint32		sk_events;
-	uint32		nwr_events;
-
-	Assert(sk->state == SS_ACTIVE);
-	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-	/*
-	 * If we need to wait for neon_walreader, ensure we have up to date socket
-	 * in the wait event set.
-	 */
-	if (sk->active_state == SS_ACTIVE_READ_WAL)
-	{
-		/*
-		 * TODO: instead of reattaching socket (and thus recreating WES) each
-		 * time we should keep it if possible, i.e. if connection is already
-		 * established. Note that single neon_walreader object can switch
-		 * between local and remote reads multiple times during its lifetime,
-		 * so careful bookkeeping is needed here.
-		 */
-		rm_safekeeper_event_set(sk, false);
-		add_nwr_event_set(sk, nwr_events);
-	}
-	else
-	{
-		/*
-		 * Hack: we should always set 0 here, but for random reasons
-		 * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
-		 * some event. Since there is also no way to remove socket except
-		 * reconstructing the whole set, SafekeeperStateDesiredEvents instead
-		 * gives WL_SOCKET_CLOSED if socket exists.
-		 */
-		Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
-		update_nwr_event_set(sk, WL_SOCKET_CLOSED);
-	}
-	walprop_pg_update_event_set(sk, sk_events);
-}
-
-static void
-walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
-{
-	rm_safekeeper_event_set(to_remove, true);
-}
-
-/*
- * A hacky way to remove single event from the event set. Can be called if event
- * doesn't exist, does nothing in this case.
- *
- * Note: Internally, this completely reconstructs the event set. It should be
- * avoided if possible.
- *
- * If is_sk is true, socket of connection to safekeeper is removed; otherwise
- * socket of neon_walreader.
- */
-static void
-rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
-{
-	WalProposer *wp = to_remove->wp;
-
-	elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
-		 to_remove->host, to_remove->port, is_sk);
-
-	/*
-	 * Shortpath for exiting if have nothing to do. We never call this
-	 * function with safekeeper socket not existing, but do that with neon
-	 * walreader socket.
-	 */
-	if ((is_sk && to_remove->eventPos == -1) ||
-		(!is_sk && to_remove->nwrEventPos == -1))
-	{
-		return;
-	}
-
-	/* Remove the existing event set, assign sk->eventPos = -1 */
-	walprop_pg_free_event_set(wp);
-
-	/* Re-initialize it without adding any safekeeper events */
-	wp->api.init_event_set(wp);
-
-	/*
-	 * loop through the existing safekeepers. If they aren't the one we're
-	 * removing, and if they have a socket we can use, re-add the applicable
-	 * events.
-	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		Safekeeper *sk = &wp->safekeeper[i];
-
-		if (sk == to_remove)
-		{
-			if (is_sk)
-				sk->eventPos = -1;
-			else
-				sk->nwrEventPos = -1;
-		}
-
-		/*
-		 * If this safekeeper isn't offline, add events for it, except for the
-		 * event requested to remove.
-		 */
-		if (sk->state != SS_OFFLINE)
-		{
-			uint32		sk_events;
-			uint32		nwr_events;
-
-			SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-			if (sk != to_remove || !is_sk)
-			{
-				/* will set sk->eventPos */
-				wp->api.add_safekeeper_event_set(sk, sk_events);
-			}
-			else if ((sk != to_remove || is_sk) && nwr_events)
-			{
-				add_nwr_event_set(sk, nwr_events);
-			}
-		}
-	}
+	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
 }

 static int
@@ -1880,17 +1668,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 static void
 walprop_pg_after_election(WalProposer *wp)
 {
-	FILE	   *f;
-	XLogRecPtr	lrRestartLsn;
+	FILE* f;
+	XLogRecPtr lrRestartLsn;

-	/* We don't need to do anything in syncSafekeepers mode. */
+	/* We don't need to do anything in syncSafekeepers mode.*/
 	if (wp->config->syncSafekeepers)
 		return;

 	/*
-	 * If there are active logical replication subscription we need to provide
-	 * enough WAL for their WAL senders based on th position of their
-	 * replication slots.
+	 * If there are active logical replication subscriptions we need
+	 * to provide enough WAL for their WAL senders based on the position
+	 * of their replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
 	if (f != NULL && !wp->config->syncSafekeepers)
@@ -1899,12 +1687,8 @@ walprop_pg_after_election(WalProposer *wp)
 		fclose(f);
 		if (lrRestartLsn != InvalidXLogRecPtr)
 		{
-			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
-
-			/*
-			 * start from the beginning of the segment to fetch page headers
-			 * verifed by XLogReader
-			 */
+			elog(LOG, "Logical replication restart LSN %X/%X",  LSN_FORMAT_ARGS(lrRestartLsn));
+			/* start from the beginning of the segment to fetch page headers verified by XLogReader */
 			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
 			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
 		}
@@ -1930,11 +1714,10 @@ static const walproposer_api walprop_pg = {
 	.recovery_download = WalProposerRecovery,
 	.wal_read = walprop_pg_wal_read,
 	.wal_reader_allocate = walprop_pg_wal_reader_allocate,
+	.free_event_set = walprop_pg_free_event_set,
 	.init_event_set = walprop_pg_init_event_set,
 	.update_event_set = walprop_pg_update_event_set,
-	.active_state_update_event_set = walprop_pg_active_state_update_event_set,
 	.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
-	.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
 	.wait_event_set = walprop_pg_wait_event_set,
 	.strong_random = walprop_pg_strong_random,
 	.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -6,6 +6,7 @@ pub use link::LinkAuthError;

 use crate::{
    auth::{self, ClientCredentials},
+    config::AuthenticationConfig,
    console::{
        self,
        provider::{CachedNodeInfo, ConsoleReqExtra},
@@ -124,6 +125,7 @@ async fn auth_quirks(
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
@@ -145,7 +147,7 @@ async fn auth_quirks(
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(api, extra, creds, client).await
+    classic::authenticate(api, extra, creds, client, config).await
 }

 impl BackendType<'_, ClientCredentials<'_>> {
@@ -180,6 +182,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
        extra: &ConsoleReqExtra<'_>,
        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
        allow_cleartext: bool,
+        config: &'static AuthenticationConfig,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

@@ -192,7 +195,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext).await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            Postgres(api, creds) => {
                info!(
@@ -202,7 +205,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(api, extra, creds, client, allow_cleartext).await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,6 +4,7 @@ use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
+    config::AuthenticationConfig,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
    proxy::{handle_try_wake, retry_after},
    sasl, scram,
@@ -17,6 +18,7 @@ pub(super) async fn authenticate(
    extra: &ConsoleReqExtra<'_>,
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    info!("fetching user's authentication info");
    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
@@ -42,7 +44,16 @@ pub(super) async fn authenticate(
                error
            })?;

-            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
+            let auth_outcome = tokio::time::timeout(
+                config.scram_protocol_timeout,
+                auth_flow.authenticate(),
+            )
+            .await
+            .map_err(|error| {
+                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
+                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
+            })?
+            .map_err(|error| {
                warn!(?error, "error processing scram messages");
                error
            })?;
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,5 +1,6 @@
 use futures::future::Either;
 use proxy::auth;
+use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
@@ -83,7 +84,9 @@ struct ProxyCliArgs {
    /// timeout for http connections
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    sql_over_http_timeout: tokio::time::Duration,
-
+    /// timeout for scram authentication protocol
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    scram_protocol_timeout: tokio::time::Duration,
    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
@@ -231,12 +234,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    let http_config = HttpConfig {
        sql_over_http_timeout: args.sql_over_http_timeout,
    };
+    let authentication_config = AuthenticationConfig {
+        scram_protocol_timeout: args.scram_protocol_timeout,
+    };
    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
        metric_collection,
        allow_self_signed_compute: args.allow_self_signed_compute,
        http_config,
+        authentication_config,
        require_client_ip: args.require_client_ip,
    }));

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,5 +1,5 @@
-use anyhow::{anyhow, Context};
-use hashbrown::HashMap;
+use anyhow::{bail, Context};
+use dashmap::DashMap;
 use pq_proto::CancelKeyData;
 use std::net::SocketAddr;
 use tokio::net::TcpStream;
@@ -8,7 +8,7 @@ use tracing::info;

 /// Enables serving `CancelRequest`s.
 #[derive(Default)]
-pub struct CancelMap(parking_lot::RwLock<HashMap<CancelKeyData, Option<CancelClosure>>>);
+pub struct CancelMap(DashMap<CancelKeyData, Option<CancelClosure>>);

 impl CancelMap {
    /// Cancel a running query for the corresponding connection.
@@ -16,7 +16,6 @@ impl CancelMap {
        // NB: we should immediately release the lock after cloning the token.
        let cancel_closure = self
            .0
-            .read()
            .get(&key)
            .and_then(|x| x.clone())
            .with_context(|| format!("query cancellation key not found: {key}"))?;
@@ -40,15 +39,19 @@ impl CancelMap {

        // Random key collisions are unlikely to happen here, but they're still possible,
        // which is why we have to take care not to rewrite an existing key.
-        self.0
-            .write()
-            .try_insert(key, None)
-            .map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
+        match self.0.entry(key) {
+            dashmap::mapref::entry::Entry::Occupied(_) => {
+                bail!("query cancellation key already exists: {key}")
+            }
+            dashmap::mapref::entry::Entry::Vacant(e) => {
+                e.insert(None);
+            }
+        }

        // This will guarantee that the session gets dropped
        // as soon as the future is finished.
        scopeguard::defer! {
-            self.0.write().remove(&key);
+            self.0.remove(&key);
            info!("dropped query cancellation key {key}");
        }

@@ -59,12 +62,12 @@ impl CancelMap {

    #[cfg(test)]
    fn contains(&self, session: &Session) -> bool {
-        self.0.read().contains_key(&session.key)
+        self.0.contains_key(&session.key)
    }

    #[cfg(test)]
    fn is_empty(&self) -> bool {
-        self.0.read().is_empty()
+        self.0.is_empty()
    }
 }

@@ -113,10 +116,7 @@ impl Session<'_> {
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
        info!("enabling query cancellation for this session");
-        self.cancel_map
-            .0
-            .write()
-            .insert(self.key, Some(cancel_closure));
+        self.cancel_map.0.insert(self.key, Some(cancel_closure));

        self.key
    }
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -14,6 +14,7 @@ pub struct ProxyConfig {
    pub metric_collection: Option<MetricCollectionConfig>,
    pub allow_self_signed_compute: bool,
    pub http_config: HttpConfig,
+    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
 }

@@ -32,6 +33,10 @@ pub struct HttpConfig {
    pub sql_over_http_timeout: tokio::time::Duration,
 }

+pub struct AuthenticationConfig {
+    pub scram_protocol_timeout: tokio::time::Duration,
+}
+
 impl TlsConfig {
    pub fn to_server_config(&self) -> Arc<rustls::ServerConfig> {
        self.config.clone()
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -194,9 +194,10 @@ impl GlobalConnPool {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
                connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
            } else {
-                latency_timer.pool_hit();
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
+                latency_timer.pool_hit();
+                latency_timer.success();
                return Ok(Client {
                    inner: Some(client),
                    span: Span::current(),
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -5,7 +5,7 @@ use crate::{
    auth::{self, backend::AuthSuccess},
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
-    config::{ProxyConfig, TlsConfig},
+    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    http::StatusCode,
    metrics::{Ids, USAGE_METRICS},
@@ -96,7 +96,9 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "proxy_compute_connection_latency_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        &["protocol", "cache_miss", "pool_miss"],
+        // http/ws/tcp, true/false, true/false, success/failed
+        // 3 * 2 * 2 * 2 = 24 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome"],
        // largest bucket = 2^16 * 0.5ms = 32s
        exponential_buckets(0.0005, 2.0, 16).unwrap(),
    )
@@ -105,19 +107,22 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {

 pub struct LatencyTimer {
    start: Instant,
-    pool_miss: bool,
-    cache_miss: bool,
    protocol: &'static str,
+    cache_miss: bool,
+    pool_miss: bool,
+    outcome: &'static str,
 }

 impl LatencyTimer {
    pub fn new(protocol: &'static str) -> Self {
        Self {
            start: Instant::now(),
+            protocol,
            cache_miss: false,
            // by default we don't do pooling
            pool_miss: true,
-            protocol,
+            // assume failed unless otherwise specified
+            outcome: "failed",
        }
    }

@@ -128,6 +133,10 @@ impl LatencyTimer {
    pub fn pool_hit(&mut self) {
        self.pool_miss = false;
    }
+
+    pub fn success(mut self) {
+        self.outcome = "success";
+    }
 }

 impl Drop for LatencyTimer {
@@ -138,6 +147,7 @@ impl Drop for LatencyTimer {
                self.protocol,
                bool_to_str(self.cache_miss),
                bool_to_str(self.pool_miss),
+                self.outcome,
            ])
            .observe(duration)
    }
@@ -340,7 +350,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        mode.allow_self_signed_compute(config),
    );
    cancel_map
-        .with_session(|session| client.connect_to_db(session, mode))
+        .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
        .await
 }

@@ -547,7 +557,10 @@ where

    // try once
    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => return Ok(res),
+        Ok(res) => {
+            latency_timer.success();
+            return Ok(res);
+        }
        Err(e) => {
            error!(error = ?e, "could not connect to compute node");
            (invalidate_cache(node_info), e)
@@ -601,7 +614,10 @@ where
    info!("wake_compute success. attempting to connect");
    loop {
        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => return Ok(res),
+            Ok(res) => {
+                latency_timer.success();
+                return Ok(res);
+            }
            Err(e) => {
                let retriable = e.should_retry(num_retries);
                if !retriable {
@@ -818,6 +834,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        self,
        session: cancellation::Session<'_>,
        mode: ClientMode,
+        config: &'static AuthenticationConfig,
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
@@ -835,7 +852,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        let latency_timer = LatencyTimer::new(mode.protocol_label());

        let auth_result = match creds
-            .authenticate(&extra, &mut stream, mode.allow_cleartext())
+            .authenticate(&extra, &mut stream, mode.allow_cleartext(), config)
            .await
        {
            Ok(auth_result) => auth_result,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -357,12 +357,6 @@ class PgProtocol:
                        result.append(cur.fetchall())
        return result

-    def safe_psql_scalar(self, query) -> Any:
-        """
-        Execute query returning single row with single column.
-        """
-        return self.safe_psql(query)[0][0]
-

@dataclass
 class AuthKeys:
@@ -1637,7 +1631,7 @@ class NeonPageserver(PgProtocol):
            ".*took more than expected to complete.*",
            # these can happen during shutdown, but it should not be a reason to fail a test
            ".*completed, took longer than expected.*",
-            '.*registered custom resource manager "neon".*',
+            '.*registered custom resource manager \\\\"neon\\\\".*',
            # AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
            # and it is not a failure of our code when it happens.
            ".*DeleteObjects.*We encountered an internal error. Please try again.*",
@@ -2583,13 +2577,6 @@ class Endpoint(PgProtocol):
    ):
        self.stop()

-    # Checkpoints running endpoint and returns pg_wal size in MB.
-    def get_pg_wal_size(self):
-        log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
-        self.safe_psql("checkpoint")
-        assert self.pgdata_dir is not None  # please mypy
-        return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
-

 class EndpointFactory:
    """An object representing multiple compute endpoints."""
@@ -2786,13 +2773,6 @@ class Safekeeper:
        return segments


-# Walreceiver as returned by sk's timeline status endpoint.
-@dataclass
-class Walreceiver:
-    conn_id: int
-    state: str
-
-
@dataclass
 class SafekeeperTimelineStatus:
    acceptor_epoch: int
@@ -2803,7 +2783,6 @@ class SafekeeperTimelineStatus:
    backup_lsn: Lsn
    peer_horizon_lsn: Lsn
    remote_consistent_lsn: Lsn
-    walreceivers: List[Walreceiver]


@dataclass
@@ -2865,7 +2844,6 @@ class SafekeeperHttpClient(requests.Session):
        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
        res.raise_for_status()
        resj = res.json()
-        walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
        return SafekeeperTimelineStatus(
            acceptor_epoch=resj["acceptor_state"]["epoch"],
            pg_version=resj["pg_info"]["pg_version"],
@@ -2875,7 +2853,6 @@ class SafekeeperHttpClient(requests.Session):
            backup_lsn=Lsn(resj["backup_lsn"]),
            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
-            walreceivers=walreceivers,
        )

    def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -404,8 +404,7 @@ def wait(f, desc, timeout=30, wait_f=None):
        try:
            if f():
                break
-        except Exception as e:
-            log.info(f"got exception while waiting for {desc}: {e}")
+        except Exception:
            pass
        elapsed = time.time() - started_at
        if elapsed > timeout:
@@ -989,40 +988,8 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
        endpoint.start()


-# Context manager which logs passed time on exit.
-class DurationLogger:
-    def __init__(self, desc):
-        self.desc = desc
-
-    def __enter__(self):
-        self.ts_before = time.time()
-
-    def __exit__(self, *exc):
-        log.info(f"{self.desc} finished in {time.time() - self.ts_before}s")
-
-
-# Context manager which logs WAL position change on exit.
-class WalChangeLogger:
-    def __init__(self, ep, desc_before):
-        self.ep = ep
-        self.desc_before = desc_before
-
-    def __enter__(self):
-        self.ts_before = time.time()
-        self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
-        log.info(f"{self.desc_before}, lsn_before={self.lsn_before}")
-
-    def __exit__(self, *exc):
-        lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
-        log.info(
-            f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s"
-        )
-
-
 # Test that we can create timeline with one safekeeper down and initialize it
-# later when some data already had been written. It is strictly weaker than
-# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute
-# download (recovery) and as such useful for development/testing.
+# later when some data already had been written.
 def test_late_init(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()
@@ -1030,13 +997,12 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
    sk1 = env.safekeepers[0]
    sk1.stop()

-    tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_late_init")
-    endpoint = env.endpoints.create_start("test_late_init")
    # create and insert smth while safekeeper is down...
+    env.neon_cli.create_branch("test_late_init")
+    endpoint = env.endpoints.create_start("test_late_init")
    endpoint.safe_psql("create table t(key int, value text)")
-    with WalChangeLogger(endpoint, "doing insert with sk1 down"):
-        endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
+    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
+    log.info("insert with safekeeper down done")
    endpoint.stop()  # stop compute

    # stop another safekeeper, and start one which missed timeline creation
@@ -1045,213 +1011,28 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
    sk1.start()

    # insert some more
-    with DurationLogger("recovery"):
-        endpoint = env.endpoints.create_start("test_late_init")
+    endpoint = env.endpoints.create_start("test_late_init")
    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")

-    wait_flush_lsn_align_by_ep(
-        env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]]
-    )
-    # Check that WALs are the same.
-    cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id)
-

 # is timeline flush_lsn equal on provided safekeepers?
-def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
-    flush_lsns = [
-        sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
-        for sk_http_cli in sk_http_clis
-    ]
-    log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}")
-    return all([flush_lsns[0] == flsn for flsn in flush_lsns])
-
-
-def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
-    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
-    return len(status.walreceivers) == 0
-
-
-# Assert by xxd that WAL on given safekeepers is identical. No compute must be
-# running for this to be reliable.
-def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
-    assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed"
-    sk_http_clis = [sk.http_client() for sk in sks]
-
-    # First check that term / flush_lsn are the same: it is easier to
-    # report/understand if WALs are different due to that.
-    statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
-    term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
-    for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
-        assert (
-            term_flush_lsns[0] == tfl
-        ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"
-
-    # check that WALs are identic.
-    segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
-    for cmp_segs, sk in zip(segs[1:], sks[1:]):
-        assert (
-            segs[0] == cmp_segs
-        ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identic: {segs[0]} and {cmp_segs}"
-    log.info(f"comparing segs {segs[0]}")
-
-    sk0 = sks[0]
-    for sk in sks[1:]:
-        (_, mismatch, not_regular) = filecmp.cmpfiles(
-            sk0.timeline_dir(tenant_id, timeline_id),
-            sk.timeline_dir(tenant_id, timeline_id),
-            segs[0],
-            shallow=False,
-        )
-        log.info(
-            f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
-        )
-
-        for f in mismatch:
-            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
-            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
-            stdout_filename = "{}.filediff".format(f2)
-
-            with open(stdout_filename, "w") as stdout_f:
-                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
-                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
-
-                cmd = "diff {}.hex {}.hex".format(f1, f2)
-                subprocess.run([cmd], stdout=stdout_f, shell=True)
-
-            assert (mismatch, not_regular) == (
-                [],
-                [],
-            ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identic"
-
-
-# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is
-# running. ep is stopped by this function. This is used in tests which check
-# binary equality of WAL segments on safekeepers; which is inherently racy as
-# shutting down endpoint might always write some WAL which can get to only one
-# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if
-# it has changed.
-def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
-    sk_http_clis = [sk.http_client() for sk in sks]
-    # First wait for the alignment.
-    wait(
-        partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
-        "flush_lsn to get aligned",
+def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
+    status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
+    status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(
+        f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
    )
-    ep.stop()  # then stop endpoint
-    # Even if there is no compute, there might be some in flight data; ensure
-    # all walreceivers die before rechecking.
-    for sk_http_cli in sk_http_clis:
-        wait(
-            partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
-            "walreceivers to be gone",
-        )
-    # Now recheck again flush_lsn and exit if it is good
-    if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
-        return
-    # Otherwise repeat.
-    log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
-    ep = env.endpoints.create_start(branch)
+    return status1.flush_lsn == status2.flush_lsn


-# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
-# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
-# compute doesn't keep many WAL for lagging sk, but still can recover it with
-# neon_walreader, in two scenarious: a) WAL never existed on compute (it started
-# on basebackup LSN later than lagging sk position) though segment file exists
-# b) WAL had been recycled on it and segment file doesn't exist.
-#
-# Also checks along the way that whenever there are two sks alive, compute
-# should be able to commit.
-def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
-    # inserts ~20MB of WAL, a bit more than a segment.
-    def fill_segment(ep):
-        ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'")
-
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-
-    (sk1, sk2, sk3) = env.safekeepers
-
-    # create and insert smth while safekeeper is down...
-    sk1.stop()
-    tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_lagging_sk")
-    ep = env.endpoints.create_start("test_lagging_sk")
-    ep.safe_psql("create table t(key int, value text)")
-    # make small insert to be on the same segment
-    ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
-    log.info("insert with safekeeper down done")
-    ep.stop()  # stop compute
-
-    # Stop another safekeeper, and start one which missed timeline creation.
-    sk2.stop()
-    sk1.start()
-
-    # Start new ep and insert some more. neon_walreader should download WAL for
-    # sk1 because it should be filled since the horizon (initial LSN) which is
-    # earlier than basebackup LSN.
-    ep = env.endpoints.create_start("test_lagging_sk")
-    ep.safe_psql("insert into t select generate_series(1,100), 'payload'")
-    # stop ep and ensure WAL is identical after recovery.
-    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
-    # Check that WALs are the same.
-    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
-
-    # Now repeat insertion with sk1 down, but with inserting more data to check
-    # that WAL on compute is removed.
-    sk1.stop()
-    sk2.start()
-
-    # min_wal_size must be at least 2x segment size.
-    min_wal_config = [
-        "min_wal_size=32MB",
-        "max_wal_size=32MB",
-        "wal_keep_size=0",
-        "log_checkpoints=on",
-    ]
-    ep = env.endpoints.create_start(
-        "test_lagging_sk",
-        config_lines=min_wal_config,
-    )
-    with WalChangeLogger(ep, "doing large insert with sk1 down"):
-        for _ in range(0, 5):
-            fill_segment(ep)
-    # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
-    assert ep.get_pg_wal_size() < 16 * 2.5
-
-    sk2.stop()  # stop another sk to ensure sk1 and sk3 can work
-    sk1.start()
-    with DurationLogger("recovery"):
-        ep.safe_psql("insert into t select generate_series(1,100), 'payload'")  # forces recovery
-    # stop ep and ensure WAL is identical after recovery.
-    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
-    # Check that WALs are the same.
-    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)
-
-    # Now do the same with different safekeeper sk2 down, and restarting ep
-    # before recovery (again scenario when recovery starts below basebackup_lsn,
-    # but multi segment now).
-    ep = env.endpoints.create_start(
-        "test_lagging_sk",
-        config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
-    )
-    with WalChangeLogger(ep, "doing large insert with sk2 down"):
-        for _ in range(0, 5):
-            fill_segment(ep)
-    # there shouldn't be more than 2 WAL segments (but dir may have archive_status files)
-    assert ep.get_pg_wal_size() < 16 * 2.5
-
-    ep.stop()
-    ep = env.endpoints.create_start(
-        "test_lagging_sk",
-        config_lines=min_wal_config,
-    )
-    sk2.start()
-    with DurationLogger("recovery"):
-        wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3])
-    # Check that WALs are the same.
-    cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)
+# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
+# 1) walproposer can't recover node if it misses WAL written by previous computes, but
+#    still starts up and functions normally if two other sks are ok.
+# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
+#    normally if two other sks are ok.
+# 3) Lagged safekeeper can still recover by peer recovery.
+def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
+    pass


 # Smaller version of test_one_sk_down testing peer recovery in isolation: that
@@ -1293,7 +1074,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024

    # wait a bit, lsns shouldn't change
-    time.sleep(2)
+    # time.sleep(5)
    sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
    sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(
@@ -1304,11 +1085,37 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    # now restart safekeeper with peer recovery enabled and wait for recovery
    sk1.stop().start(extra_opts=["--peer-recovery=true"])
    wait(
-        partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
+        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )

-    cmp_sk_wal([sk1, sk2], tenant_id, timeline_id)
+    # check that WALs are identical after recovery
+    segs = sk1.list_segments(tenant_id, timeline_id)
+    log.info(f"segs are {segs}")
+
+    (_, mismatch, not_regular) = filecmp.cmpfiles(
+        sk1.timeline_dir(tenant_id, timeline_id),
+        sk2.timeline_dir(tenant_id, timeline_id),
+        segs,
+        shallow=False,
+    )
+    log.info(
+        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+    )
+
+    for f in mismatch:
+        f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
+        f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
+        stdout_filename = "{}.filediff".format(f2)
+
+        with open(stdout_filename, "w") as stdout_f:
+            subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+            subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+            cmd = "diff {}.hex {}.hex".format(f1, f2)
+            subprocess.run([cmd], stdout=stdout_f, shell=True)
+
+    assert (mismatch, not_regular) == ([], [])

    # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
    env.safekeepers[2].stop()