WIP

2026-07-03 04:00:37 +00:00 · 2023-10-05 18:21:18 +02:00 · 2023-10-05 18:13:54 +02:00 · 2023-10-05 18:06:26 +02:00 · 2023-10-05 18:02:22 +02:00 · 2023-10-05 16:54:02 +02:00
32 changed files with 2719 additions and 3106 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1092,10 +1092,8 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-
-            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -158,6 +158,17 @@ dependencies = [
 "syn 1.0.109",
 ]

+[[package]]
+name = "async-channel"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
+dependencies = [
+ "concurrent-queue",
+ "event-listener",
+ "futures-core",
+]
+
 [[package]]
 name = "async-compression"
 version = "0.4.0"
@@ -1031,6 +1042,15 @@ dependencies = [
 "zstd",
 ]

+[[package]]
+name = "concurrent-queue"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400"
+dependencies = [
+ "crossbeam-utils",
+]
+
 [[package]]
 name = "const_format"
 version = "0.2.30"
@@ -1452,6 +1472,12 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "event-listener"
+version = "2.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
+
 [[package]]
 name = "fail"
 version = "0.5.1"
@@ -2674,6 +2700,7 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-channel",
 "async-compression",
 "async-stream",
 "async-trait",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+async-channel = "1.9.0"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,7 +44,7 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use tracing::{error, info, info_span};
+use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;
@@ -57,7 +57,6 @@ use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
-use utils::id::TenantTimelineId;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -250,20 +249,11 @@ fn main() -> Result<()> {

    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
-    let pspec = state.pspec.as_ref().expect("spec must be set");
-    let ttid = TenantTimelineId {
-        tenant_id: pspec.tenant_id,
-        timeline_id: pspec.timeline_id,
-    };
    drop(state);

-    // Log ttid everywhere for easier log identification (e.g. loki agent can
-    // create label on that).
-    let _guard = info_span!("", ttid = %ttid).entered();
-
    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute, ttid);
-    let _configurator_handle = launch_configurator(&compute, ttid);
+    let _monitor_handle = launch_monitor(&compute);
+    let _configurator_handle = launch_configurator(&compute);

    // Start Postgres
    let mut delay_exit = false;
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -4,13 +4,11 @@ use std::thread;
 use tracing::{error, info, instrument};

 use compute_api::responses::ComputeStatus;
-use utils::id::TenantTimelineId;

 use crate::compute::ComputeNode;

-// Log ttid everywhere
-#[instrument(name = "", fields(ttid = %ttid), skip_all)]
-fn configurator_main_loop(compute: &Arc<ComputeNode>, ttid: TenantTimelineId) {
+#[instrument(skip_all)]
+fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
        let state = compute.state.lock().unwrap();
@@ -43,16 +41,13 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>, ttid: TenantTimelineId) {
    }
 }

-pub fn launch_configurator(
-    compute: &Arc<ComputeNode>,
-    ttid: TenantTimelineId,
-) -> thread::JoinHandle<()> {
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);

    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
-            configurator_main_loop(&compute, ttid);
+            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
        .expect("cannot launch configurator thread")
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,8 +3,7 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info, instrument};
-use utils::id::TenantTimelineId;
+use tracing::{debug, info};

 use crate::compute::ComputeNode;

@@ -13,8 +12,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-#[instrument(name = "", fields(ttid = %ttid), skip_all)]
-fn watch_compute_activity(compute: &ComputeNode, ttid: TenantTimelineId) {
+fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
@@ -105,11 +103,11 @@ fn watch_compute_activity(compute: &ComputeNode, ttid: TenantTimelineId) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>, ttid: TenantTimelineId) -> thread::JoinHandle<()> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let state = Arc::clone(state);

    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state, ttid))
+        .spawn(move || watch_compute_activity(&state))
        .expect("cannot launch compute monitor thread")
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
+async-channel.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -86,15 +86,18 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.

+use std::sync::{Arc, Mutex, MutexGuard};
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
+    page_cache_permit: Option<Arc<crate::page_cache::PinnedSlotsPermit>>,
 }

 /// The kind of access to the page cache.
@@ -150,6 +153,7 @@ impl RequestContextBuilder {
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
+                page_cache_permit: None,
            },
        }
    }
@@ -163,6 +167,7 @@ impl RequestContextBuilder {
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
+                page_cache_permit: original.page_cache_permit.clone(),
            },
        }
    }
@@ -186,6 +191,11 @@ impl RequestContextBuilder {
        self
    }

+    pub(crate) fn page_cache_permit(mut self, p: Arc<crate::page_cache::PinnedSlotsPermit>) -> Self {
+        self.inner.page_cache_permit = Some(p); // shared permit; read back via RequestContext::permit()
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -286,4 +296,8 @@ impl RequestContext {
    pub(crate) fn page_content_kind(&self) -> PageContentKind {
        self.page_content_kind
    }
+
+    pub(crate) fn permit(&self) -> Option<&crate::page_cache::PinnedSlotsPermit> {
+        self.page_cache_permit.as_deref() // Option<Arc<T>> -> Option<&T>; borrows, no refcount bump
+    }
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -133,8 +133,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

-        fail::fail_point!("control-plane-client-re-attach");
-
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -170,8 +168,6 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

-        fail::fail_point!("control-plane-client-validate");
-
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

        Ok(response
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -220,8 +220,6 @@ where
                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } else {
-                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
                }
                this_list_valid
            });
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -314,7 +314,6 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
    AcquirePinnedSlotTimeout,
-    EvictIterLimit,
 }

 pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -967,7 +966,6 @@ pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
    pub(crate) keys_executed: IntCounter,
-    pub(crate) keys_validated: IntCounter,
    pub(crate) dropped_lsn_updates: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
    pub(crate) remote_errors: IntCounterVec,
@@ -989,13 +987,7 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {

    keys_executed: register_int_counter!(
        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
-    )
-    .expect("failed to define a metric"),
-
-    keys_validated: register_int_counter!(
-        "pageserver_deletion_queue_validated_total",
-        "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
    )
    .expect("failed to define a metric"),

--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -66,8 +66,7 @@
 //! inserted to the mapping, but you must hold the write-lock on the slot until
 //! the contents are valid. If you need to release the lock without initializing
 //! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
-//! page, the caller must explicitly call guard.mark_valid() after it has
+//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
 //! initialized it. If the guard is dropped without calling mark_valid(), the
 //! mapping is automatically removed and the slot is marked free.
 //!
@@ -79,6 +78,7 @@ use std::{
        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
        Arc, Weak,
    },
+    task::Poll,
    time::Duration,
 };

@@ -215,16 +215,21 @@ impl Slot {

 impl SlotInner {
    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
-    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
-        let mut guard = self.permit.lock().unwrap();
-        if let Some(existing_permit) = guard.upgrade() {
-            drop(guard);
-            drop(permit);
-            existing_permit
-        } else {
-            let permit = Arc::new(permit);
-            *guard = Arc::downgrade(&permit);
-            permit
+    fn coalesce_readers_permit<'c>(&self, permit: PermitKind<'c>) -> PermitKindReadGuard<'c> {
+        match permit {
+            PermitKind::CtxProvided(permit) => PermitKindReadGuard::CtxProvided(permit),
+            PermitKind::Acquired(permit) => {
+                let mut guard = self.permit.lock().unwrap();
+                if let Some(existing_permit) = guard.upgrade() {
+                    drop(guard);
+                    drop(permit); // a reader already pins this slot: give our own permit back
+                    PermitKindReadGuard::Coalesced(existing_permit)
+                } else {
+                    let permit = Arc::new(permit);
+                    *guard = Arc::downgrade(&permit); // later readers can upgrade and share it
+                    PermitKindReadGuard::Coalesced(permit)
+                }
+            }
        }
    }
 }
@@ -252,21 +257,36 @@ pub struct PageCache {
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,

+    find_victim_sender:
+        async_channel::Sender<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
+    find_victim_waiters:
+        async_channel::Receiver<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
+
    size_metrics: &'static PageCacheSizeMetrics,
 }

-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+pub(crate) struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
+enum PermitKind<'c> {
+    CtxProvided(&'c PinnedSlotsPermit), // borrowed from the caller's RequestContext (ctx.permit())
+    Acquired(PinnedSlotsPermit), // owned permit obtained by try_get_pinned_slot_permit for this access
+}
+
+enum PermitKindReadGuard<'c> {
+    CtxProvided(&'c PinnedSlotsPermit), // passed through unchanged from PermitKind::CtxProvided
+    Coalesced(Arc<PinnedSlotsPermit>), // presumably the permit shared by a slot's readers (see coalesce_readers_permit)
+}

 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i> {
-    _permit: Arc<PinnedSlotsPermit>,
+pub struct PageReadGuard<'c, 'i> {
+    _permit: PermitKindReadGuard<'c>,
    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
 }

-impl std::ops::Deref for PageReadGuard<'_> {
+impl std::ops::Deref for PageReadGuard<'_, '_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
@@ -274,7 +294,7 @@ impl std::ops::Deref for PageReadGuard<'_> {
    }
 }

-impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
+impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_, '_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
        self.slot_guard.buf
    }
@@ -286,78 +306,89 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 ///
 /// Counterintuitively, this is used even for a read, if the requested page is not
 /// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid(). Similarly
-/// lock_for_write() can return an invalid buffer that the caller is expected to
-/// to initialize.
-///
-pub struct PageWriteGuard<'i> {
-    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
-
-    _permit: PinnedSlotsPermit,
-
-    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
-    valid: bool,
+/// is expected to fill in the page contents and call mark_valid().
+pub struct PageWriteGuard<'c, 'i> {
+    state: PageWriteGuardState<'c, 'i>,
 }

-impl std::ops::DerefMut for PageWriteGuard<'_> {
+enum PageWriteGuardState<'c, 'i> {
+    Invalid { // contents not yet marked valid; dropping the guard in this state removes the mapping
+        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+        _permit: PermitKindReadGuard<'c>,
+    },
+    Downgraded, // left behind by mark_valid() once lock and permit moved into a PageReadGuard
+}
+
+impl std::ops::DerefMut for PageWriteGuard<'_, '_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.inner.buf
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

-impl std::ops::Deref for PageWriteGuard<'_> {
+impl std::ops::Deref for PageWriteGuard<'_, '_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.inner.buf
+        match &self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

-impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
+impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_, '_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        self.inner.buf
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(), // mark_valid() consumes the guard
+        }
    }
 }

-impl PageWriteGuard<'_> {
+impl<'c, 'a> PageWriteGuard<'c, 'a> {
    /// Mark that the buffer contents are now valid.
-    pub fn mark_valid(&mut self) {
-        assert!(self.inner.key.is_some());
-        assert!(
-            !self.valid,
-            "mark_valid called on a buffer that was already valid"
-        );
-        self.valid = true;
+    #[must_use]
+    pub fn mark_valid(mut self) -> PageReadGuard<'c, 'a> {
+        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
+        match prev {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                PageReadGuard {
+                    _permit,
+                    slot_guard: inner.downgrade(),
+                }
+            }
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

-impl Drop for PageWriteGuard<'_> {
+impl Drop for PageWriteGuard<'_, '_> {
    ///
    /// If the buffer was allocated for a page that was not already in the
    /// cache, but the lock_for_read/write() caller dropped the buffer without
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        assert!(self.inner.key.is_some());
-        if !self.valid {
-            let self_key = self.inner.key.as_ref().unwrap();
-            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-            self.inner.key = None;
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                let self_key = inner.key.as_ref().unwrap();
+                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+                inner.key = None;
+            }
+            PageWriteGuardState::Downgraded => {}
        }
    }
 }

 /// lock_for_read() return value
-pub enum ReadBufResult<'a> {
-    Found(PageReadGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
-}
-
-/// lock_for_write() return value
-pub enum WriteBufResult<'a> {
-    Found(PageWriteGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
+pub enum ReadBufResult<'c, 'a> {
+    Found(PageReadGuard<'c, 'a>),
+    NotFound(PageWriteGuard<'c, 'a>),
 }

 impl PageCache {
@@ -379,10 +410,9 @@ impl PageCache {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
-        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+        let Ok(permit) = self.try_get_pinned_slot_permit(ctx).await else {
            return None;
        };
-
        crate::metrics::PAGE_CACHE
            .for_ctx(ctx)
            .read_accesses_materialized_page
@@ -430,12 +460,13 @@ impl PageCache {
    /// Store an image of the given page in the cache.
    ///
    pub async fn memorize_materialized_page(
-        &self,
+        &'static self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
        lsn: Lsn,
        img: &[u8],
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
@@ -446,30 +477,87 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key).await? {
-            WriteBufResult::Found(write_guard) => {
-                // We already had it in cache. Another thread must've put it there
-                // concurrently. Check that it had the same contents that we
-                // replayed.
-                assert!(*write_guard == img);
+        let mut permit = Some(self.try_get_pinned_slot_permit(ctx).await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
+                // The page was found in the mapping. Lock the slot, and re-check
+                // that it's still what we expected (because we released the mapping
+                // lock already, another thread could have evicted the page)
+                let slot = &self.slots[slot_idx];
+                let inner = slot.inner.write().await;
+                if inner.key.as_ref() == Some(&cache_key) {
+                    slot.inc_usage_count();
+                    debug_assert!(
+                        {
+                            let guard = inner.permit.lock().unwrap();
+                            guard.upgrade().is_none()
+                        },
+                        "we hold a write lock, so, no one else should have a permit"
+                    );
+                    debug_assert_eq!(inner.buf.len(), img.len());
+                    // We already had it in cache. Another thread must've put it there
+                    // concurrently. Check that it had the same contents that we
+                    // replayed.
+                    assert!(inner.buf == img);
+                    return Ok(());
+                }
            }
-            WriteBufResult::NotFound(mut write_guard) => {
-                write_guard.copy_from_slice(img);
-                write_guard.mark_valid();
-            }
-        }
+            debug_assert!(permit.is_some());

-        Ok(())
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+            // Create a write guard for the slot so we go through the expected motions.
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+            let mut write_guard = PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
+            };
+            write_guard.copy_from_slice(img);
+            let _ = write_guard.mark_valid();
+            return Ok(());
+        }
    }

    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub async fn read_immutable_buf(
-        &self,
+    pub async fn read_immutable_buf<'c>(
+        &'static self,
        file_id: FileId,
        blkno: u32,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<ReadBufResult> {
+        ctx: &'c RequestContext,
+    ) -> anyhow::Result<ReadBufResult<'c, 'static>> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key, ctx).await
@@ -483,7 +571,22 @@ impl PageCache {
    // "mappings" after this section. But the routines in this section should
    // not require changes.

-    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+    pub(crate) async fn get_permit(&self) -> Arc<PinnedSlotsPermit> {
+        let semaphore = Arc::clone(&self.pinned_slots);
+        let permit = semaphore
+            .acquire_owned()
+            .await // no timeout is applied in this function
+            .expect("the semaphore is never closed");
+        Arc::new(PinnedSlotsPermit(permit))
+    }
+
+    async fn try_get_pinned_slot_permit<'c>(
+        &self,
+        ctx: &'c RequestContext,
+    ) -> anyhow::Result<PermitKind<'c>> {
+        if let Some(permit) = ctx.permit() {
+            return Ok(PermitKind::CtxProvided(permit));
+        };
        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
        match tokio::time::timeout(
            // Choose small timeout, neon_smgr does its own retries.
@@ -493,9 +596,9 @@ impl PageCache {
        )
        .await
        {
-            Ok(res) => Ok(PinnedSlotsPermit(
+            Ok(res) => Ok(PermitKind::Acquired(PinnedSlotsPermit(
                res.expect("this semaphore is never closed"),
-            )),
+            ))),
            Err(_timeout) => {
                timer.stop_and_discard();
                crate::metrics::page_cache_errors_inc(
@@ -515,10 +618,10 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(
+    async fn try_lock_for_read<'c>(
        &self,
        cache_key: &mut CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
+        permit: &mut Option<PermitKind<'c>>,
    ) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
@@ -571,11 +674,11 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &self,
+        &'static self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        let mut permit = Some(self.try_get_pinned_slot_permit(ctx).await?);

        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
@@ -638,99 +741,10 @@ impl PageCache {
            );

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                _permit: permit.take().unwrap(),
-                inner,
-                valid: false,
-            }));
-        }
-    }
-
-    /// Look up a page in the cache and lock it in write mode. If it's not
-    /// found, returns None.
-    ///
-    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(
-        &self,
-        cache_key: &CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
-    ) -> Option<PageWriteGuard> {
-        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
-            // The page was found in the mapping. Lock the slot, and re-check
-            // that it's still what we expected (because we don't released the mapping
-            // lock already, another thread could have evicted the page)
-            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().await;
-            if inner.key.as_ref() == Some(cache_key) {
-                slot.inc_usage_count();
-                debug_assert!(
-                    {
-                        let guard = inner.permit.lock().unwrap();
-                        guard.upgrade().is_none()
-                    },
-                    "we hold a write lock, so, no one else should have a permit"
-                );
-                return Some(PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
                    _permit: permit.take().unwrap(),
                    inner,
-                    valid: true,
-                });
-            }
-        }
-        None
-    }
-
-    /// Return a write-locked buffer for given block.
-    ///
-    /// Similar to lock_for_read(), but the returned buffer is write-locked and
-    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
-                debug_assert!(permit.is_none());
-                return Ok(WriteBufResult::Found(write_guard));
-            }
-            debug_assert!(permit.is_some());
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-
-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-
-            return Ok(WriteBufResult::NotFound(PageWriteGuard {
-                _permit: permit.take().unwrap(),
-                inner,
-                valid: false,
            }));
        }
    }
@@ -775,7 +789,7 @@ impl PageCache {
    ///
    /// Like 'search_mapping, but performs an "exact" search. Used for
    /// allocating a new buffer.
-    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
+    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
@@ -882,10 +896,12 @@ impl PageCache {
    ///
    /// On return, the slot is empty and write-locked.
    async fn find_victim(
-        &self,
+        &'static self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let iter_limit = self.slots.len() * 10;
+        // Get in line.
+        let receiver = self.find_victim_waiters.recv();
+
        let mut iters = 0;
        loop {
            iters += 1;
@@ -897,41 +913,8 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > iter_limit {
-                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
-                            // any particular number of iterations: other threads might race ahead and acquire and
-                            // release pins just as we're scanning the array.
-                            //
-                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
-                            // slots. There are two threads running concurrently, A and B. A has just
-                            // acquired the permit from the semaphore.
-                            //
-                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
-                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //
-                            // Now we're back in the starting situation that both slots have
-                            // usage_count 1, but A has now been through one iteration of the
-                            // find_victim() loop. This can repeat indefinitely and on each
-                            // iteration, A's iteration count increases by one.
-                            //
-                            // So, even though the semaphore for the permits is fair, the victim search
-                            // itself happens in parallel and is not fair.
-                            // Hence even with a permit, a task can theoretically be starved.
-                            // To avoid this, we'd need tokio to give priority to tasks that are holding
-                            // permits for longer.
-                            // Note that just yielding to tokio during iteration without such
-                            // priority boosting is likely counter-productive. We'd just give more opportunities
-                            // for B to bump usage count, further starving A.
-                            crate::metrics::page_cache_errors_inc(
-                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
-                            );
-                            anyhow::bail!("exceeded evict iter limit");
+                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
+                            unreachable!("find_victim_waiters prevents starvation");
                        }
                        continue;
                    }
@@ -942,7 +925,16 @@ impl PageCache {
                    inner.key = None;
                }
                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-                return Ok((slot_idx, inner));
+                self.find_victim_sender
+                    .try_send((slot_idx, inner))
+                    .expect("we always get in line first");
+                match futures::poll!(receiver) {
+                    Poll::Ready(Ok(res)) => return Ok(res),
+                    Poll::Ready(Err(_closed)) => unreachable!("we never close"),
+                    Poll::Pending => {
+                        unreachable!("we just sent to the channel and got in line earlier")
+                    }
+                }
            }
        }
    }
@@ -979,6 +971,7 @@ impl PageCache {
            })
            .collect();

+        let (find_victim_sender, find_victim_waiters) = async_channel::bounded(num_pages);
        Self {
            materialized_page_map: Default::default(),
            immutable_page_map: Default::default(),
@@ -986,6 +979,8 @@ impl PageCache {
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+            find_victim_sender,
+            find_victim_waiters,
        }
    }
 }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -20,10 +20,10 @@ use std::io::{Error, ErrorKind};

 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
-    pub async fn read_blob(
+    pub async fn read_blob<'c>(
        &self,
        offset: u64,
-        ctx: &RequestContext,
+        ctx: &'c RequestContext,
    ) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf, ctx).await?;
@@ -31,11 +31,11 @@ impl<'a> BlockCursor<'a> {
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub async fn read_blob_into_buf(
+    pub async fn read_blob_into_buf<'c>(
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
-        ctx: &RequestContext,
+        ctx: &'c RequestContext,
    ) -> Result<(), std::io::Error> {
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -34,27 +34,27 @@ where
 }

 /// Reference to an in-memory copy of an immutable on-disk block.
-pub enum BlockLease<'a> {
-    PageReadGuard(PageReadGuard<'static>),
+pub enum BlockLease<'c, 'a> {
+    PageReadGuard(PageReadGuard<'c, 'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
 }

-impl From<PageReadGuard<'static>> for BlockLease<'static> {
-    fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
+impl<'c, 'a> From<PageReadGuard<'c, 'a>> for BlockLease<'c, 'a> {
+    fn from(value: PageReadGuard<'c, 'a>) -> BlockLease<'c, 'a> {
        BlockLease::PageReadGuard(value)
    }
 }

 #[cfg(test)]
-impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+impl<'c, 'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'c, 'a> {
    fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
        BlockLease::Arc(value)
    }
 }

-impl<'a> Deref for BlockLease<'a> {
+impl<'c, 'a> Deref for BlockLease<'c, 'a> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
@@ -83,11 +83,11 @@ pub(crate) enum BlockReaderRef<'a> {

 impl<'a> BlockReaderRef<'a> {
    #[inline(always)]
-    async fn read_blk(
+    async fn read_blk<'c>(
        &self,
        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+        ctx: &'c RequestContext,
+    ) -> Result<BlockLease<'c, '_>, std::io::Error> {
        use BlockReaderRef::*;
        match self {
            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
@@ -141,11 +141,11 @@ impl<'a> BlockCursor<'a> {
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    #[inline(always)]
-    pub async fn read_blk(
+    pub async fn read_blk<'c>(
        &self,
        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+        ctx: &'c RequestContext,
+    ) -> Result<BlockLease<'c, '_>, std::io::Error> {
        self.reader.read_blk(blknum, ctx).await
    }
 }
@@ -180,32 +180,27 @@ impl FileBlockReader {
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(
+    pub async fn read_blk<'c>(
        &self,
        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+        ctx: &'c RequestContext,
+    ) -> Result<BlockLease<'c, 'static>, std::io::Error> {
        let cache = page_cache::get();
-        loop {
-            match cache
-                .read_immutable_buf(self.file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        format!("Failed to read immutable buf: {e:#}"),
-                    )
-                })? {
-                ReadBufResult::Found(guard) => break Ok(guard.into()),
-                ReadBufResult::NotFound(mut write_guard) => {
-                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                    write_guard.mark_valid();
-
-                    // Swap for read lock
-                    continue;
-                }
-            };
+        match cache
+            .read_immutable_buf(self.file_id, blknum, ctx)
+            .await
+            .map_err(|e| {
+                std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    format!("Failed to read immutable buf: {e:#}"),
+                )
+            })? {
+            ReadBufResult::Found(guard) => Ok(guard.into()),
+            ReadBufResult::NotFound(mut write_guard) => {
+                // Read the page from disk into the buffer
+                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                Ok(write_guard.mark_valid().into())
+            }
        }
    }
 }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -64,44 +64,40 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) async fn read_blk(
+    pub(crate) async fn read_blk<'c>(
        &self,
        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
+        ctx: &'c RequestContext,
+    ) -> Result<BlockLease<'c, '_>, io::Error> {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum, self.file.path, e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                            .await?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        // order path before error because error is anyhow::Error => might have many contexts
+                        format!(
+                            "ephemeral file: read immutable page #{}: {}: {:#}",
+                            blknum, self.file.path, e,
+                        ),
+                    )
+                })? {
+                page_cache::ReadBufResult::Found(guard) => {
+                    return Ok(BlockLease::PageReadGuard(guard))
+                }
+                page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                    let buf: &mut [u8] = write_guard.deref_mut();
+                    debug_assert_eq!(buf.len(), PAGE_SZ);
+                    self.file
+                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                        .await?;
+                    let read_guard = write_guard.mark_valid();
+                    return Ok(BlockLease::PageReadGuard(read_guard));
+                }
+            };
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -171,7 +167,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        write_guard.mark_valid();
+                                        let _ = write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -549,7 +549,7 @@ impl DeltaLayer {
    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
+    pub(crate) async fn load_keys<'c>(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
@@ -1038,9 +1038,9 @@ pub struct ValueRef<'a> {
    reader: BlockCursor<'a>,
 }

-impl<'a> ValueRef<'a> {
+impl<'c, 'a> ValueRef<'a> {
    /// Loads the value from disk
-    pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
+    pub async fn load(&self, ctx: &'c RequestContext) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
        let val = Value::des(&buf)?;
@@ -1051,11 +1051,11 @@ impl<'a> ValueRef<'a> {
 pub(crate) struct Adapter<T>(T);

 impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) async fn read_blk(
+    pub(crate) async fn read_blk<'c>(
        &self,
        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+        ctx: &'c RequestContext,
+    ) -> Result<BlockLease<'c, '_>, std::io::Error> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -158,7 +158,7 @@ pub struct Timeline {

    /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
    /// Never changes for the lifetime of this [`Timeline`] object.
-    ///  
+    ///
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    generation: Generation,
@@ -505,7 +505,7 @@ impl Timeline {
        timer.stop_and_record();

        let start = Instant::now();
-        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
+        let res = self.reconstruct_value(key, lsn, reconstruct_state, ctx).await;
        let elapsed = start.elapsed();
        crate::metrics::RECONSTRUCT_TIME
            .for_result(&res)
@@ -4279,6 +4279,7 @@ impl Timeline {
        key: Key,
        request_lsn: Lsn,
        mut data: ValueReconstructState,
+        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        // Perform WAL redo if needed
        data.records.reverse();
@@ -4342,6 +4343,7 @@ impl Timeline {
                            key,
                            last_rec_lsn,
                            &img,
+                            ctx,
                        )
                        .await
                        .context("Materialized page memoization failed")
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -16,7 +16,7 @@
 use std::{
    collections::HashMap,
    ops::ControlFlow,
-    sync::Arc,
+    sync::{Arc, Mutex},
    time::{Duration, SystemTime},
 };

@@ -25,7 +25,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
-    context::{DownloadBehavior, RequestContext},
+    context::{DownloadBehavior, RequestContext, RequestContextBuilder},
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -397,9 +397,14 @@ impl Timeline {
            }
        }

+        let permit = crate::page_cache::get().get_permit().await;
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_cache_permit(permit)
+            .build();
+
        // imitiate repartiting on first compactation
        if let Err(e) = self
-            .collect_keyspace(lsn, ctx)
+            .collect_keyspace(lsn, &ctx)
            .instrument(info_span!("collect_keyspace"))
            .await
        {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -544,7 +544,7 @@ impl VirtualFile {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
-    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
+    ) -> Result<crate::tenant::block_io::BlockLease<'_, '_>, std::io::Error> {
        use crate::page_cache::PAGE_SZ;
        let mut buf = [0; PAGE_SZ];
        self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -7,12 +7,12 @@ OBJS = \
 	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
+	libpqwalproposer.o \
 	neon.o \
-	neon_utils.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_pg.o \
+	walproposer_utils.o \
 	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,7 +30,7 @@

 #include "neon.h"
 #include "walproposer.h"
-#include "neon_utils.h"
+#include "walproposer_utils.h"

 #define PageStoreTrace DEBUG5

--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -0,0 +1,424 @@
+#include "postgres.h"
+
+#include "libpq-fe.h"
+#include "neon.h"
+#include "walproposer.h"
+
+/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
+struct WalProposerConn
+{
+	PGconn	   *pg_conn;
+	bool		is_nonblocking; /* whether the connection is non-blocking */
+	char	   *recvbuf;		/* last received data from
+								 * walprop_async_read */
+};
+
+/* Helper function */
+static bool
+ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
+{
+	/* If we're already correctly blocking or nonblocking, all good */
+	if (is_nonblocking == conn->is_nonblocking)
+		return true;
+
+	/* Otherwise, set it appropriately */
+	if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
+		return false;
+
+	conn->is_nonblocking = is_nonblocking;
+	return true;
+}
+
+/* Exported function definitions */
+char *
+walprop_error_message(WalProposerConn *conn)
+{
+	return PQerrorMessage(conn->pg_conn);
+}
+
+WalProposerConnStatusType
+walprop_status(WalProposerConn *conn)
+{
+	switch (PQstatus(conn->pg_conn))
+	{
+		case CONNECTION_OK:
+			return WP_CONNECTION_OK;
+		case CONNECTION_BAD:
+			return WP_CONNECTION_BAD;
+		default:
+			return WP_CONNECTION_IN_PROGRESS;
+	}
+}
+
+/*
+ * Begin a libpq connection using the connection string `conninfo`, with
+ * `password` as the password when it is not NULL. Returns NULL if libpq
+ * could not allocate the connection object.
+ */
+WalProposerConn *
+walprop_connect_start(char *conninfo, char *password)
+{
+	WalProposerConn *conn;
+	PGconn	   *pg_conn;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;
+
+	/*
+	 * Connect using the given connection string. If the NEON_AUTH_TOKEN
+	 * environment variable was set, use that as the password.
+	 *
+	 * The connection options are parsed in the order they're given, so when
+	 * we set the password before the connection string, the connection
+	 * string can override the password from the env variable. Seems useful,
+	 * although we don't currently use that capability anywhere.
+	 */
+	n = 0;
+	if (password)
+	{
+		keywords[n] = "password";
+		values[n] = password;
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = conninfo;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	pg_conn = PQconnectStartParams(keywords, values, 1);
+
+	/*
+	 * Allocation of a PGconn can fail, and will return NULL. We want to fully
+	 * replicate the behavior of PQconnectStart here.
+	 */
+	if (!pg_conn)
+		return NULL;
+
+	/*
+	 * In theory this allocation can fail as well, but palloc will exit on
+	 * failure, so there's not much we could do if it *did* fail.
+	 */
+	conn = palloc(sizeof(WalProposerConn));
+	conn->pg_conn = pg_conn;
+	conn->is_nonblocking = false;	/* connections always start in blocking
+									 * mode */
+	conn->recvbuf = NULL;
+	return conn;
+}
+
+WalProposerConnectPollStatusType
+walprop_connect_poll(WalProposerConn *conn)
+{
+	WalProposerConnectPollStatusType return_val;
+
+	switch (PQconnectPoll(conn->pg_conn))
+	{
+		case PGRES_POLLING_FAILED:
+			return_val = WP_CONN_POLLING_FAILED;
+			break;
+		case PGRES_POLLING_READING:
+			return_val = WP_CONN_POLLING_READING;
+			break;
+		case PGRES_POLLING_WRITING:
+			return_val = WP_CONN_POLLING_WRITING;
+			break;
+		case PGRES_POLLING_OK:
+			return_val = WP_CONN_POLLING_OK;
+			break;
+
+			/*
+			 * There's a comment at its source about this constant being
+			 * unused. We'll expect it's never returned.
+			 */
+		case PGRES_POLLING_ACTIVE:
+			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
+
+			/*
+			 * This return is never actually reached, but it's here to make
+			 * the compiler happy
+			 */
+			return WP_CONN_POLLING_FAILED;
+
+		default:
+			Assert(false);
+			return_val = WP_CONN_POLLING_FAILED;	/* keep the compiler quiet */
+	}
+
+	return return_val;
+}
+
+bool
+walprop_send_query(WalProposerConn *conn, char *query)
+{
+	/*
+	 * We need to be in blocking mode for sending the query to run without
+	 * requiring a call to PQflush
+	 */
+	if (!ensure_nonblocking_status(conn, false))
+		return false;
+
+	/* PQsendQuery returns 1 on success, 0 on failure */
+	if (!PQsendQuery(conn->pg_conn, query))
+		return false;
+
+	return true;
+}
+
+/*
+ * Fetch, without blocking, the result of the previously sent query. The
+ * libpq result object is freed here; only its status is reported.
+ */
+WalProposerExecStatusType
+walprop_get_query_result(WalProposerConn *conn)
+{
+	PGresult   *result;
+	WalProposerExecStatusType return_val;
+	char	   *unexpected_success = NULL;	/* set if a warning is needed */
+
+	/* Consume any input that we might be missing */
+	if (!PQconsumeInput(conn->pg_conn))
+		return WP_EXEC_FAILED;
+
+	if (PQisBusy(conn->pg_conn))
+		return WP_EXEC_NEEDS_INPUT;
+
+	result = PQgetResult(conn->pg_conn);
+
+	/* NULL means the command already finished and has no more results */
+	if (!result)
+	{
+		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
+		return WP_EXEC_UNEXPECTED_SUCCESS;
+	}
+
+	/* Helper macro to reduce boilerplate */
+#define UNEXPECTED_SUCCESS(msg) \
+		return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
+		unexpected_success = msg; \
+		break;
+
+	switch (PQresultStatus(result))
+	{
+			/* "true" success case */
+		case PGRES_COPY_BOTH:
+			return_val = WP_EXEC_SUCCESS_COPYBOTH;
+			break;
+
+			/* Unexpected success case */
+		case PGRES_EMPTY_QUERY:
+			UNEXPECTED_SUCCESS("empty query return");
+		case PGRES_COMMAND_OK:
+			UNEXPECTED_SUCCESS("data-less command end");
+		case PGRES_TUPLES_OK:
+			UNEXPECTED_SUCCESS("tuples return");
+		case PGRES_COPY_OUT:
+			UNEXPECTED_SUCCESS("'Copy Out' response");
+		case PGRES_COPY_IN:
+			UNEXPECTED_SUCCESS("'Copy In' response");
+		case PGRES_SINGLE_TUPLE:
+			UNEXPECTED_SUCCESS("single tuple return");
+		case PGRES_PIPELINE_SYNC:
+			UNEXPECTED_SUCCESS("pipeline sync point");
+
+			/* Failure cases */
+		case PGRES_BAD_RESPONSE:
+		case PGRES_NONFATAL_ERROR:
+		case PGRES_FATAL_ERROR:
+		case PGRES_PIPELINE_ABORTED:
+			return_val = WP_EXEC_FAILED;
+			break;
+
+		default:
+			Assert(false);
+			return_val = WP_EXEC_FAILED;	/* keep the compiler quiet */
+	}
+
+	/* Only the status was needed; free the result so it is not leaked */
+	PQclear(result);
+
+	if (unexpected_success)
+		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
+
+	return return_val;
+}
+
+pgsocket
+walprop_socket(WalProposerConn *conn)
+{
+	return PQsocket(conn->pg_conn);
+}
+
+int
+walprop_flush(WalProposerConn *conn)
+{
+	return (PQflush(conn->pg_conn));
+}
+
+void
+walprop_finish(WalProposerConn *conn)
+{
+	if (conn->recvbuf != NULL)
+		PQfreemem(conn->recvbuf);
+	PQfinish(conn->pg_conn);
+	pfree(conn);
+}
+
+/*
+ * Receive a message from the safekeeper.
+ *
+ * On success, the data is placed in *buf and its length in *amount. It is
+ * valid until the next call to this function. Otherwise both are cleared.
+ */
+PGAsyncReadResult
+walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
+{
+	int			result;
+
+	if (conn->recvbuf != NULL)
+	{
+		PQfreemem(conn->recvbuf);
+		conn->recvbuf = NULL;
+	}
+
+	/* Call PQconsumeInput so that we have the data we need */
+	if (!PQconsumeInput(conn->pg_conn))
+	{
+		*amount = 0;
+		*buf = NULL;
+		return PG_ASYNC_READ_FAIL;
+	}
+
+	/*
+	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
+	 * still in progress but no "complete row" is available; -1 if the copy is
+	 * done; -2 if an error occurred; (> 0) if it was successful, that value
+	 * being the amount transferred.
+	 *
+	 * The protocol we use between walproposer and safekeeper means that we
+	 * *usually* wouldn't expect to see that the copy is done, but this can
+	 * sometimes be triggered by the server returning an ErrorResponse (which
+	 * also happens to have the effect that the copy is done).
+	 */
+	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
+	{
+		case 0:
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_TRY_AGAIN;
+		case -1:
+			{
+				/*
+				 * If we get -1, it's probably because of a server error; the
+				 * safekeeper won't normally send a CopyDone message.
+				 *
+				 * We can check PQgetResult to make sure that the server
+				 * failed; it'll always result in PGRES_FATAL_ERROR
+				 */
+				PGresult   *res = PQgetResult(conn->pg_conn);
+				ExecStatusType status = PQresultStatus(res);
+
+				/* Only the status is needed; don't leak the result */
+				PQclear(res);
+
+				if (status != PGRES_FATAL_ERROR)
+					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+
+				/* An actual error is reported by calls to PQerrorMessage */
+				*amount = 0;
+				*buf = NULL;
+				return PG_ASYNC_READ_FAIL;
+			}
+		case -2:
+			*amount = 0;
+			*buf = NULL;
+			return PG_ASYNC_READ_FAIL;
+		default:
+			/* Positive values indicate the size of the returned result */
+			*amount = result;
+			*buf = conn->recvbuf;
+			return PG_ASYNC_READ_SUCCESS;
+	}
+}
+
+/*
+ * Queue the given data as COPY data in non-blocking mode and try to flush it.
+ * Returns PG_ASYNC_WRITE_TRY_FLUSH if not everything could be sent yet.
+ */
+PGAsyncWriteResult
+walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
+{
+	int			result;
+
+	/* If we aren't in non-blocking mode, switch to it. */
+	if (!ensure_nonblocking_status(conn, true))
+		return PG_ASYNC_WRITE_FAIL;
+
+	/*
+	 * The docs for PQputCopyData list the return values as: 1 if the data was
+	 * queued, 0 if it was not queued because of full buffers, or -1 if an
+	 * error occurred
+	 */
+	result = PQputCopyData(conn->pg_conn, buf, size);
+
+	/* Zero is not expected: walproposer empties the buffers before sending */
+	Assert(result != 0);
+
+	switch (result)
+	{
+		case 1:
+			/* good -- continue */
+			break;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQputCopyData", result);
+	}
+
+	/*
+	 * After queueing the data, we still need to flush to get it to send. This
+	 * might take multiple tries, but we don't want to wait around until it's
+	 * done.
+	 *
+	 * PQflush has the following returns (per the docs): 0 if successful, 1 if
+	 * it was unable to send all the data in the send queue yet, -1 on failure
+	 */
+	switch (result = PQflush(conn->pg_conn))
+	{
+		case 0:
+			return PG_ASYNC_WRITE_SUCCESS;
+		case 1:
+			return PG_ASYNC_WRITE_TRY_FLUSH;
+		case -1:
+			return PG_ASYNC_WRITE_FAIL;
+		default:
+			elog(FATAL, "invalid return %d from PQflush", result);
+	}
+}
+
+/*
+ * Blocking counterpart of walprop_async_write (see the comments there):
+ * queue the data and flush it, returning false if either step fails.
+ */
+bool
+walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
+{
+	int			result;
+
+	/* If we are in non-blocking mode, switch out of it. */
+	if (!ensure_nonblocking_status(conn, false))
+		return false;
+
+	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
+		return false;
+
+	Assert(result == 1);
+
+	/* Because the connection is now blocking, flushing returns 0 or -1 */
+
+	if ((result = PQflush(conn->pg_conn)) == -1)
+		return false;
+
+	Assert(result == 0);
+	return true;
+}
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -18,10 +18,6 @@ extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;

-extern char *wal_acceptors_list;
-extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connection_timeout;
-
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

@@ -34,10 +30,4 @@ extern void pg_init_extension_server(void);
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

-extern uint64 BackpressureThrottlingTime(void);
-extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
-
-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-
 #endif							/* NEON_H */
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,116 +0,0 @@
-#include "postgres.h"
-
-#include "access/timeline.h"
-#include "access/xlogutils.h"
-#include "common/logging.h"
-#include "common/ip.h"
-#include "funcapi.h"
-#include "libpq/libpq.h"
-#include "libpq/pqformat.h"
-#include "miscadmin.h"
-#include "postmaster/interrupt.h"
-#include "replication/slot.h"
-#include "replication/walsender_private.h"
-
-#include "storage/ipc.h"
-#include "utils/builtins.h"
-#include "utils/ps_status.h"
-
-#include "libpq-fe.h"
-#include <netinet/tcp.h>
-#include <unistd.h>
-
-#if PG_VERSION_NUM >= 150000
-#include "access/xlogutils.h"
-#include "access/xlogrecovery.h"
-#endif
-#if PG_MAJORVERSION_NUM >= 16
-#include "utils/guc.h"
-#endif
-
-/*
- * Convert a character which represents a hexadecimal digit to an integer.
- *
- * Returns -1 if the character is not a hexadecimal digit.
- */
-int
-HexDecodeChar(char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 10;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 10;
-
-	return -1;
-}
-
-/*
- * Decode a hex string into a byte string, 2 hex chars per byte.
- *
- * Returns false if invalid characters are encountered; otherwise true.
- */
-bool
-HexDecodeString(uint8 *result, char *input, int nbytes)
-{
-	int			i;
-
-	for (i = 0; i < nbytes; ++i)
-	{
-		int			n1 = HexDecodeChar(input[i * 2]);
-		int			n2 = HexDecodeChar(input[i * 2 + 1]);
-
-		if (n1 < 0 || n2 < 0)
-			return false;
-		result[i] = n1 * 16 + n2;
-	}
-
-	return true;
-}
-
-/* --------------------------------
- *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint32
-pq_getmsgint32_le(StringInfo msg)
-{
-	uint32		n32;
-
-	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
-
-	return n32;
-}
-
-/* --------------------------------
- *		pq_getmsgint64	- get a binary 8-byte int from a message buffer in native (LE) order
- * --------------------------------
- */
-uint64
-pq_getmsgint64_le(StringInfo msg)
-{
-	uint64		n64;
-
-	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
-
-	return n64;
-}
-
-/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint32_le(StringInfo buf, uint32 i)
-{
-	enlargeStringInfo(buf, sizeof(uint32));
-	memcpy(buf->data + buf->len, &i, sizeof(uint32));
-	buf->len += sizeof(uint32);
-}
-
-/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */
-void
-pq_sendint64_le(StringInfo buf, uint64 i)
-{
-	enlargeStringInfo(buf, sizeof(uint64));
-	memcpy(buf->data + buf->len, &i, sizeof(uint64));
-	buf->len += sizeof(uint64);
-}
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -1,12 +0,0 @@
-#ifndef __NEON_UTILS_H__
-#define __NEON_UTILS_H__
-
-#include "postgres.h"
-
-bool		HexDecodeString(uint8 *result, char *input, int nbytes);
-uint32		pq_getmsgint32_le(StringInfo msg);
-uint64		pq_getmsgint64_le(StringInfo msg);
-void		pq_sendint32_le(StringInfo buf, uint32 i);
-void		pq_sendint64_le(StringInfo buf, uint64 i);
-
-#endif							/* __NEON_UTILS_H__ */
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,8 +1,8 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__

-#include "postgres.h"
 #include "access/xlogdefs.h"
+#include "postgres.h"
 #include "port.h"
 #include "access/xlog_internal.h"
 #include "access/transam.h"
@@ -16,15 +16,29 @@
 #define MAX_SAFEKEEPERS 32
 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)	/* max size of a single* WAL
 											 * message */
+#define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
+#define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
+								 * message header */
+#define XLOG_HDR_END_POS (1 + 8)	/* offset of end position in wal sender*
+									 * message header */
+
 /*
 * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0

-struct WalProposerConn;			/* Defined in implementation (walprop_pg.c) */
+extern char *wal_acceptors_list;
+extern int	wal_acceptor_reconnect_timeout;
+extern int	wal_acceptor_connection_timeout;
+extern bool am_wal_proposer;
+
+struct WalProposerConn;			/* Defined in libpqwalproposer */
 typedef struct WalProposerConn WalProposerConn;

+struct WalMessage;
+typedef struct WalMessage WalMessage;
+
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -38,7 +52,7 @@ typedef enum
 	PG_ASYNC_READ_TRY_AGAIN,
 	/* Reading failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_READ_FAIL,
-} PGAsyncReadResult;
+}			PGAsyncReadResult;

 /* Possible return values from WritePGAsync */
 typedef enum
@@ -57,7 +71,7 @@ typedef enum
 	PG_ASYNC_WRITE_TRY_FLUSH,
 	/* Writing failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_WRITE_FAIL,
-} PGAsyncWriteResult;
+}			PGAsyncWriteResult;

 /*
 * WAL safekeeper state, which is used to wait for some event.
@@ -133,7 +147,7 @@ typedef enum
 	 * to read.
 	 */
 	SS_ACTIVE,
-} SafekeeperState;
+}			SafekeeperState;

 /* Consensus logical timestamp. */
 typedef uint64 term_t;
@@ -157,12 +171,12 @@ typedef struct ProposerGreeting
 	uint8		tenant_id[16];
 	TimeLineID	timeline;
 	uint32		walSegSize;
-} ProposerGreeting;
+}			ProposerGreeting;

 typedef struct AcceptorProposerMessage
 {
 	uint64		tag;
-} AcceptorProposerMessage;
+}			AcceptorProposerMessage;

 /*
 * Acceptor -> Proposer initial response: the highest term acceptor voted for.
@@ -172,7 +186,7 @@ typedef struct AcceptorGreeting
 	AcceptorProposerMessage apm;
 	term_t		term;
 	NNodeId		nodeId;
-} AcceptorGreeting;
+}			AcceptorGreeting;

 /*
 * Proposer -> Acceptor vote request.
@@ -182,20 +196,20 @@ typedef struct VoteRequest
 	uint64		tag;
 	term_t		term;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-} VoteRequest;
+}			VoteRequest;

 /* Element of term switching chain. */
 typedef struct TermSwitchEntry
 {
 	term_t		term;
 	XLogRecPtr	lsn;
-} TermSwitchEntry;
+}			TermSwitchEntry;

 typedef struct TermHistory
 {
 	uint32		n_entries;
 	TermSwitchEntry *entries;
-} TermHistory;
+}			TermHistory;

 /* Vote itself, sent from safekeeper to proposer */
 typedef struct VoteResponse
@@ -213,7 +227,7 @@ typedef struct VoteResponse
 								 * recovery of some safekeeper */
 	TermHistory termHistory;
 	XLogRecPtr	timelineStartLsn;	/* timeline globally starts at this LSN */
-} VoteResponse;
+}			VoteResponse;

 /*
 * Proposer -> Acceptor message announcing proposer is elected and communicating
@@ -229,7 +243,7 @@ typedef struct ProposerElected
 	TermHistory *termHistory;
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
-} ProposerElected;
+}			ProposerElected;

 /*
 * Header of request with WAL message sent from proposer to safekeeper.
@@ -254,7 +268,7 @@ typedef struct AppendRequestHeader
 	 */
 	XLogRecPtr	truncateLsn;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-} AppendRequestHeader;
+}			AppendRequestHeader;

 /*
 * Hot standby feedback received from replica
@@ -264,7 +278,7 @@ typedef struct HotStandbyFeedback
 	TimestampTz ts;
 	FullTransactionId xmin;
 	FullTransactionId catalog_xmin;
-} HotStandbyFeedback;
+}			HotStandbyFeedback;

 typedef struct PageserverFeedback
 {
@@ -275,7 +289,7 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
-} PageserverFeedback;
+}			PageserverFeedback;

 typedef struct WalproposerShmemState
 {
@@ -283,7 +297,7 @@ typedef struct WalproposerShmemState
 	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
-} WalproposerShmemState;
+}			WalproposerShmemState;

 /*
 * Report safekeeper state to proposer
@@ -307,22 +321,17 @@ typedef struct AppendResponse
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
-} AppendResponse;
+}			AppendResponse;

 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)

-struct WalProposer;
-typedef struct WalProposer WalProposer;
-
 /*
 * Descriptor of safekeeper
 */
 typedef struct Safekeeper
 {
-	WalProposer *wp;
-
 	char const *host;
 	char const *port;

@@ -331,7 +340,7 @@ typedef struct Safekeeper
 	 *
 	 * May contain private information like password and should not be logged.
 	 */
-	char		conninfo[MAXCONNINFO];
+	char conninfo[MAXCONNINFO];

 	/*
 	 * postgres protocol connection to the WAL acceptor
@@ -364,12 +373,27 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
+	TimestampTz latestMsgReceivedAt;        /* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

+extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
+extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(void);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+											PageserverFeedback *rf);
+extern void StartProposerReplication(StartReplicationCmd *cmd);
+
+extern Size WalproposerShmemSize(void);
+extern bool WalproposerShmemInit(void);
+extern void replication_feedback_set(PageserverFeedback *rf);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+/* libpqwalproposer hooks & helper type */
+
 /* Re-exported PostgresPollingStatusType */
 typedef enum
 {
@@ -382,7 +406,7 @@ typedef enum
 	 * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
 	 * We've removed it here to avoid clutter.
 	 */
-} WalProposerConnectPollStatusType;
+}			WalProposerConnectPollStatusType;

 /* Re-exported and modified ExecStatusType */
 typedef enum
@@ -407,7 +431,7 @@ typedef enum
 	WP_EXEC_NEEDS_INPUT,
 	/* Catch-all failure. Check PQerrorMessage. */
 	WP_EXEC_FAILED,
-} WalProposerExecStatusType;
+}			WalProposerExecStatusType;

 /* Re-exported ConnStatusType */
 typedef enum
@@ -421,252 +445,67 @@ typedef enum
 	 * that extra functionality, so we collect them into a single tag here.
 	 */
 	WP_CONNECTION_IN_PROGRESS,
-} WalProposerConnStatusType;
+}			WalProposerConnStatusType;
+
+/* Re-exported PQerrorMessage */
+extern char *walprop_error_message(WalProposerConn *conn);
+
+/* Re-exported PQstatus */
+extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
+
+/* Re-exported PQconnectStart */
+extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
+
+/* Re-exported PQconnectPoll */
+extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
+
+/* Blocking wrapper around PQsendQuery */
+extern bool walprop_send_query(WalProposerConn *conn, char *query);
+
+/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
+extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
+
+/* Re-exported PQsocket */
+extern pgsocket walprop_socket(WalProposerConn *conn);
+
+/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
+extern int	walprop_flush(WalProposerConn *conn);
+
+/* Re-exported PQfinish */
+extern void walprop_finish(WalProposerConn *conn);

 /*
- * Collection of hooks for walproposer, to call postgres functions,
- * read WAL and send it over the network.
+ * Ergonomic wrapper around PQgetCopyData
+ *
+ * Reads a CopyData block from a safekeeper, setting *amount to the number
+ * of bytes returned.
+ *
+ * This function is allowed to assume certain properties specific to the
+ * protocol with the safekeepers, so it should not be used as-is for any
+ * other purpose.
+ *
+ * Note: If possible, using <AsyncRead> is generally preferred, because it
+ * performs a bit of extra checking work that's always required and is normally
+ * somewhat verbose.
 */
-typedef struct walproposer_api
-{
-	/*
-	 * Get WalproposerShmemState. This is used to store information about last
-	 * elected term.
-	 */
-	WalproposerShmemState *(*get_shmem_state) (void);
-
-	/*
-	 * Start receiving notifications about new WAL. This is an infinite loop
-	 * which calls WalProposerBroadcast() and WalProposerPoll() to send the
-	 * WAL.
-	 */
-	void		(*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
-
-	/* Get pointer to the latest available WAL. */
-	XLogRecPtr	(*get_flush_rec_ptr) (void);
-
-	/* Get current time. */
-	TimestampTz (*get_current_timestamp) (void);
-
-	/* Get postgres timeline. */
-	TimeLineID	(*get_timeline_id) (void);
-
-	/* Current error message, aka PQerrorMessage. */
-	char	   *(*conn_error_message) (WalProposerConn *conn);
-
-	/* Connection status, aka PQstatus. */
-	WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
-
-	/* Start the connection, aka PQconnectStart. */
-	WalProposerConn *(*conn_connect_start) (char *conninfo);
-
-	/* Poll an asynchronous connection, aka PQconnectPoll. */
-	WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
-
-	/* Send a blocking SQL query, aka PQsendQuery. */
-	bool		(*conn_send_query) (WalProposerConn *conn, char *query);
-
-	/* Read the query result, aka PQgetResult. */
-	WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
-
-	/* Flush buffer to the network, aka PQflush. */
-	int			(*conn_flush) (WalProposerConn *conn);
-
-	/* Close the connection, aka PQfinish. */
-	void		(*conn_finish) (WalProposerConn *conn);
-
-	/* Try to read CopyData message, aka PQgetCopyData. */
-	PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
-
-	/* Try to write CopyData message, aka PQputCopyData. */
-	PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
-
-	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
-	bool		(*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
-
-	/* Download WAL from startpos to endpos and make it available locally. */
-	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
-
-	/* Read WAL from disk to buf. */
-	void		(*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
-
-	/* Allocate WAL reader. */
-	XLogReaderState *(*wal_reader_allocate) (void);
-
-	/* Deallocate event set. */
-	void		(*free_event_set) (void);
-
-	/* Initialize event set. */
-	void		(*init_event_set) (int n_safekeepers);
-
-	/* Update events for an existing safekeeper connection. */
-	void		(*update_event_set) (Safekeeper *sk, uint32 events);
-
-	/* Add a new safekeeper connection to the event set. */
-	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
-
-	/*
-	 * Wait until some event happens: - timeout is reached - socket event for
-	 * safekeeper connection - new WAL is available
-	 *
-	 * Returns 0 if timeout is reached, 1 if some event happened. Updates
-	 * events mask to indicate events and sets sk to the safekeeper which has
-	 * an event.
-	 */
-	int			(*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
-
-	/* Read random bytes. */
-	bool		(*strong_random) (void *buf, size_t len);
-
-	/*
-	 * Get a basebackup LSN. Used to cross-validate with the latest available
-	 * LSN on the safekeepers.
-	 */
-	XLogRecPtr	(*get_redo_start_lsn) (void);
-
-	/*
-	 * Finish sync safekeepers with the given LSN. This function should not
-	 * return and should exit the program.
-	 */
-	void		(*finish_sync_safekeepers) (XLogRecPtr lsn);
-
-	/*
-	 * Called after every new message from the safekeeper. Used to propagate
-	 * backpressure feedback and to confirm WAL persistence (has been commited
-	 * on the quorum of safekeepers).
-	 */
-	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
-
-	/*
-	 * Called on peer_horizon_lsn updates. Used to advance replication slot
-	 * and to free up disk space by deleting unnecessary WAL.
-	 */
-	void		(*confirm_wal_streamed) (XLogRecPtr lsn);
-} walproposer_api;
+extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);

 /*
- * Configuration of the WAL proposer.
+ * Ergonomic wrapper around PQputCopyData + PQflush
+ *
+ * Starts to write a CopyData block to a safekeeper.
+ *
+ * For information on the meaning of return codes, refer to PGAsyncWriteResult.
 */
-typedef struct WalProposerConfig
-{
-	/* hex-encoded TenantId cstr */
-	char	   *neon_tenant;
-
-	/* hex-encoded TimelineId cstr */
-	char	   *neon_timeline;
-
-	/*
-	 * Comma-separated list of safekeepers, in the following format:
-	 * host1:port1,host2:port2,host3:port3
-	 *
-	 * This cstr should be editable.
-	 */
-	char	   *safekeepers_list;
-
-	/*
-	 * WalProposer reconnects to offline safekeepers once in this interval.
-	 * Time is in milliseconds.
-	 */
-	int			safekeeper_reconnect_timeout;
-
-	/*
-	 * WalProposer terminates the connection if it doesn't receive any message
-	 * from the safekeeper in this interval. Time is in milliseconds.
-	 */
-	int			safekeeper_connection_timeout;
-
-	/*
-	 * WAL segment size. Will be passed to safekeepers in greet request. Also
-	 * used to detect page headers.
-	 */
-	int			wal_segment_size;
-
-	/*
-	 * If safekeeper was started in sync mode, walproposer will not subscribe
-	 * for new WAL and will exit when quorum of safekeepers will be synced to
-	 * the latest available LSN.
-	 */
-	bool		syncSafekeepers;
-
-	/* Will be passed to safekeepers in greet request. */
-	uint64		systemId;
-} WalProposerConfig;
-
+extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);

 /*
- * WAL proposer state.
+ * Blocking equivalent to walprop_async_write
+ *
+ * Returns 'true' if successful, 'false' on failure.
 */
-typedef struct WalProposer
-{
-	WalProposerConfig *config;
-	int			n_safekeepers;
+extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);

-	/* (n_safekeepers / 2) + 1 */
-	int			quorum;
-
-	Safekeeper	safekeeper[MAX_SAFEKEEPERS];
-
-	/* WAL has been generated up to this point */
-	XLogRecPtr	availableLsn;
-
-	/* last commitLsn broadcasted to safekeepers */
-	XLogRecPtr	lastSentCommitLsn;
-
-	ProposerGreeting greetRequest;
-
-	/* Vote request for safekeeper */
-	VoteRequest voteRequest;
-
-	/*
-	 * Minimal LSN which may be needed for recovery of some safekeeper,
-	 * record-aligned (first record which might not yet received by someone).
-	 */
-	XLogRecPtr	truncateLsn;
-
-	/*
-	 * Term of the proposer. We want our term to be highest and unique, so we
-	 * collect terms from safekeepers quorum, choose max and +1. After that
-	 * our term is fixed and must not change. If we observe that some
-	 * safekeeper has higher term, it means that we have another running
-	 * compute, so we must stop immediately.
-	 */
-	term_t		propTerm;
-
-	/* term history of the proposer */
-	TermHistory propTermHistory;
-
-	/* epoch start lsn of the proposer */
-	XLogRecPtr	propEpochStartLsn;
-
-	/* Most advanced acceptor epoch */
-	term_t		donorEpoch;
-
-	/* Most advanced acceptor */
-	int			donor;
-
-	/* timeline globally starts at this LSN */
-	XLogRecPtr	timelineStartLsn;
-
-	/* number of votes collected from safekeepers */
-	int			n_votes;
-
-	/* number of successful connections over the lifetime of walproposer */
-	int			n_connected;
-
-	/*
-	 * Timestamp of the last reconnection attempt. Related to
-	 * config->safekeeper_reconnect_timeout
-	 */
-	TimestampTz last_reconnect_attempt;
-
-	walproposer_api api;
-} WalProposer;
-
-extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api);
-extern void WalProposerStart(WalProposer *wp);
-extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
-extern void WalProposerPoll(WalProposer *wp);
-extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
-										   PageserverFeedback *rf);
+extern uint64 BackpressureThrottlingTime(void);

 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -0,0 +1,659 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "walproposer_utils.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogutils.h"
+#include "access/xlogrecovery.h"
+#endif
+#if PG_MAJORVERSION_NUM >= 16
+#include "utils/guc.h"
+#endif
+
+/*
+ * These variables are used similarly to openLogFile/SegNo,
+ * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
+ * corresponding the filename of walpropFile.
+ */
+static int	walpropFile = -1;
+static TimeLineID walpropFileTLI = 0;
+static XLogSegNo walpropSegNo = 0;
+
+/* START cloned file-local variables and functions from walsender.c */
+
+/*
+ * How far have we sent WAL already? This is also advertised in
+ * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
+ */
+static XLogRecPtr sentPtr = InvalidXLogRecPtr;
+
+static void WalSndLoop(void);
+static void XLogBroadcastWalProposer(void);
+/* END cloned file-level variables and functions from walsender.c */
+
+/*
+ * Three-way comparison of two XLogRecPtr values passed by pointer, using the
+ * (const void *, const void *) comparator signature.
+ *
+ * Returns -1, 0 or 1 when *a is respectively less than, equal to or greater
+ * than *b.
+ */
+int
+CompareLsn(const void *a, const void *b)
+{
+	const XLogRecPtr left = *((const XLogRecPtr *) a);
+	const XLogRecPtr right = *((const XLogRecPtr *) b);
+
+	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
+	return (left > right) - (left < right);
+}
+
+/*
+ * Map a SafekeeperState to a short human-readable description.
+ *
+ * The result is a string literal: it must not be freed or modified.
+ *
+ * The wording is chosen so that the string can precede the word "state":
+ *
+ *   elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
+ *
+ * Where that phrasing reads badly, bracket the value instead:
+ *
+ *   elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
+ */
+char *
+FormatSafekeeperState(SafekeeperState state)
+{
+	switch (state)
+	{
+		case SS_OFFLINE:
+			return "offline";
+
+			/* Both connecting sub-states share one description. */
+		case SS_CONNECTING_READ:
+		case SS_CONNECTING_WRITE:
+			return "connecting";
+
+		case SS_WAIT_EXEC_RESULT:
+			return "receiving query result";
+		case SS_HANDSHAKE_RECV:
+			return "handshake (receiving)";
+		case SS_VOTING:
+			return "voting";
+		case SS_WAIT_VERDICT:
+			return "wait-for-verdict";
+		case SS_SEND_ELECTED_FLUSH:
+			return "send-announcement-flush";
+		case SS_IDLE:
+			return "idle";
+		case SS_ACTIVE:
+			return "active";
+	}
+
+	/* Only reachable when `state` matches none of the cases above. */
+	Assert(false);
+	return NULL;
+}
+
+/*
+ * Assert that `events` is consistent with the events a safekeeper in
+ * sk->state is supposed to be waiting for.
+ *
+ * The events are acceptable when either
+ *   (a) the state expects nothing and `events` carries neither a read-ready
+ *       nor a write-ready bit, or
+ *   (b) the state expects something and `events` overlaps with it.
+ *
+ * On a mismatch a WARNING describing the situation is logged first, followed
+ * by an assertion that is then guaranteed to fail.
+ */
+void
+AssertEventsOkForState(uint32 events, Safekeeper *sk)
+{
+	const uint32 socket_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+	uint32		expected = SafekeeperStateDesiredEvents(sk->state);
+	bool		events_ok_for_state;
+
+	events_ok_for_state = (expected == WL_NO_EVENTS)
+		? ((events & socket_events) == 0)
+		: ((events & expected) != 0);
+
+	if (events_ok_for_state)
+		return;
+
+	/* Log the details before tripping the assertion below. */
+	elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+		 FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state));
+	Assert(events_ok_for_state);
+}
+
+/*
+ * Return the wait-event mask a safekeeper in the given state should be
+ * waiting on.
+ *
+ * Some states want no events at all; WL_NO_EVENTS (= 0) is returned for
+ * those. An unrecognised state trips an assertion and, in builds without
+ * assertions, also yields WL_NO_EVENTS.
+ */
+uint32
+SafekeeperStateDesiredEvents(SafekeeperState state)
+{
+	switch (state)
+	{
+			/* The offline state expects no events. */
+		case SS_OFFLINE:
+			return WL_NO_EVENTS;
+
+			/* Connecting states say what they want in the name. */
+		case SS_CONNECTING_WRITE:
+			return WL_SOCKET_WRITEABLE;
+		case SS_CONNECTING_READ:
+			return WL_SOCKET_READABLE;
+
+			/* Reading states need the socket to be read-ready to continue. */
+		case SS_WAIT_EXEC_RESULT:
+		case SS_HANDSHAKE_RECV:
+		case SS_WAIT_VERDICT:
+			return WL_SOCKET_READABLE;
+
+			/*
+			 * Idle states treat read-readiness as a sign that the connection
+			 * has been disconnected.
+			 */
+		case SS_VOTING:
+		case SS_IDLE:
+			return WL_SOCKET_READABLE;
+
+			/*
+			 * Flushing needs write-readiness, and the active state both
+			 * reads and writes.
+			 *
+			 * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
+			 * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
+			 */
+		case SS_SEND_ELECTED_FLUSH:
+		case SS_ACTIVE:
+			return WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
+
+		default:
+			break;
+	}
+
+	/* Unknown state: complain in assert-enabled builds, else report nothing. */
+	Assert(false);
+	return WL_NO_EVENTS;
+}
+
+/*
+ * Render a wait-event mask as a short human-readable string.
+ *
+ * Each known flag owns one character position, in this order:
+ *
+ *   L (latch set), R (socket readable), W (socket writeable), T (timeout),
+ *   D (postmaster death), E (exit on postmaster death), C (socket connected)
+ *
+ * A flag that is not set is shown as '_'. If `events` contains bits outside
+ * that set, a WARNING is logged and a trailing '*' is appended.
+ *
+ * If `events` does not come from the `events` field of a `WaitEvent`, the
+ * result may be meaningless.
+ *
+ * The returned pointer refers to a static buffer: it must not be freed, and
+ * its contents are overwritten by the next call.
+ */
+char *
+FormatEvents(uint32 events)
+{
+	/* 7 flag characters + optional '*' marker + terminating NUL */
+	static char return_str[9];
+
+	/* Every flag we know how to print; anything else is "unexpected". */
+	const uint32 all_flags = WL_LATCH_SET
+	| WL_SOCKET_READABLE
+	| WL_SOCKET_WRITEABLE
+	| WL_TIMEOUT
+	| WL_POSTMASTER_DEATH
+	| WL_EXIT_ON_PM_DEATH
+	| WL_SOCKET_CONNECTED;
+	const uint32 unexpected = events & ~all_flags;
+
+	/*
+	 * The formatting here isn't supposed to be *particularly* useful -- it's
+	 * just to give a sense of what events have been triggered without
+	 * needing to remember your powers of two.
+	 *
+	 * NOTE(review): on non-Windows builds PostgreSQL appears to define
+	 * WL_SOCKET_CONNECTED as an alias of WL_SOCKET_WRITEABLE, in which case
+	 * 'C' simply mirrors 'W' -- confirm against storage/latch.h.
+	 */
+	return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_';
+	return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_';
+	return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
+	return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_';
+	return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
+	return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
+	/* Own slot for 'C'; it previously overwrote the 'E' position. */
+	return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
+
+	if (unexpected != 0)
+	{
+		/* `unexpected` is unsigned, so print it with %u. */
+		elog(WARNING, "Event formatting found unexpected component %u",
+			 unexpected);
+		return_str[7] = '*';
+		return_str[8] = '\0';
+	}
+	else
+		return_str[7] = '\0';
+
+	return return_str;
+}
+
+/*
+ * Translate one hexadecimal digit character into its numeric value.
+ *
+ * Accepts '0'-'9', 'a'-'f' and 'A'-'F' and returns 0..15 accordingly.
+ * Any other character yields -1.
+ */
+static int
+HexDecodeChar(char c)
+{
+	int			value = -1;
+
+	if ('0' <= c && c <= '9')
+		value = c - '0';
+	else if ('a' <= c && c <= 'f')
+		value = 10 + (c - 'a');
+	else if ('A' <= c && c <= 'F')
+		value = 10 + (c - 'A');
+
+	return value;
+}
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * result: output buffer; must have room for at least nbytes bytes.
+ * input:  hex digits; 2 * nbytes characters are consumed on success.
+ * nbytes: number of bytes to produce.
+ *
+ * Returns false as soon as a character that is not a hexadecimal digit is
+ * encountered (this includes the NUL terminator of an input that is too
+ * short); otherwise true. On failure the bytes of `result` decoded so far
+ * have already been written.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+	int			i;
+
+	for (i = 0; i < nbytes; ++i)
+	{
+		int			hi;
+		int			lo;
+
+		/*
+		 * Validate the first digit before even looking at the second one: if
+		 * the first character is the terminating NUL of a too-short input,
+		 * reading the next character would step past the end of the string.
+		 */
+		hi = HexDecodeChar(input[i * 2]);
+		if (hi < 0)
+			return false;
+
+		lo = HexDecodeChar(input[i * 2 + 1]);
+		if (lo < 0)
+			return false;
+
+		result[i] = hi * 16 + lo;
+	}
+
+	return true;
+}
+
+/* --------------------------------
+ *		pq_getmsgint32_le	- read a binary 4-byte int from a message buffer
+ *
+ * The bytes are copied verbatim with pq_copymsgbytes and no byte swapping is
+ * done, so the value is interpreted in the host's native order (expected to
+ * be little-endian, as the name says).
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+	uint32		value;
+
+	pq_copymsgbytes(msg, (char *) &value, sizeof(uint32));
+	return value;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64_le	- read a binary 8-byte int from a message buffer
+ *
+ * The bytes are copied verbatim with pq_copymsgbytes and no byte swapping is
+ * done, so the value is interpreted in the host's native order (expected to
+ * be little-endian, as the name says).
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+	uint64		value;
+
+	pq_copymsgbytes(msg, (char *) &value, sizeof(uint64));
+	return value;
+}
+
+/*
+ * Append a binary [u]int32 to a StringInfo buffer in native (LE) order.
+ *
+ * The raw bytes of `i` are copied to the end of the buffer without any byte
+ * swapping, and buf->len grows by sizeof(uint32).
+ */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+	const Size	width = sizeof(i);
+	char	   *dest;
+
+	enlargeStringInfo(buf, width);
+
+	/* Compute the write position only after the buffer has been enlarged. */
+	dest = buf->data + buf->len;
+	memcpy(dest, &i, width);
+	buf->len += width;
+}
+
+/*
+ * Append a binary [u]int64 to a StringInfo buffer in native (LE) order.
+ *
+ * The raw bytes of `i` are copied to the end of the buffer without any byte
+ * swapping, and buf->len grows by sizeof(uint64).
+ */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+	const Size	width = sizeof(i);
+	char	   *dest;
+
+	enlargeStringInfo(buf, width);
+
+	/* Compute the write position only after the buffer has been enlarged. */
+	dest = buf->data + buf->len;
+	memcpy(dest, &i, width);
+	buf->len += width;
+}
+
+/*
+ * Write XLOG data to disk.
+ *
+ * Writes `nbytes` bytes from `buf` into local WAL segment files, starting at
+ * WAL position `recptr`. The currently open segment is tracked in the
+ * file-level variables walpropFile / walpropSegNo / walpropFileTLI. Segments
+ * are opened through XLogFileInit as needed, a write never crosses a segment
+ * boundary, and a segment is closed once `recptr` has moved past it.
+ *
+ * Any failed write raises PANIC, so the function only returns after all
+ * `nbytes` bytes have been written.
+ */
+void
+XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
+{
+	int			startoff;
+	int			byteswritten;
+
+	/* Each iteration writes at most the rest of one segment. */
+	while (nbytes > 0)
+	{
+		int			segbytes;
+
+		/* Close the current segment if it's completed */
+		if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+			XLogWalPropClose(recptr);
+
+		/* No segment open: open the one that contains recptr. */
+		if (walpropFile < 0)
+		{
+#if PG_VERSION_NUM >= 150000
+			/* FIXME Is it ok to use hardcoded value here? */
+			TimeLineID	tli = 1;
+#else
+			bool		use_existent = true;
+#endif
+			/* Create/use new log file */
+			XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
+#if PG_VERSION_NUM >= 150000
+			walpropFile = XLogFileInit(walpropSegNo, tli);
+			walpropFileTLI = tli;
+#else
+			walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
+			walpropFileTLI = ThisTimeLineID;
+#endif
+		}
+
+		/* Calculate the start offset of the received logs */
+		startoff = XLogSegmentOffset(recptr, wal_segment_size);
+
+		/* Clamp this write so that it stops at the end of the segment. */
+		if (startoff + nbytes > wal_segment_size)
+			segbytes = wal_segment_size - startoff;
+		else
+			segbytes = nbytes;
+
+		/* OK to write the logs */
+		errno = 0;
+
+		byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
+		/* A zero-length write is treated as a failure, just like an error. */
+		if (byteswritten <= 0)
+		{
+			char		xlogfname[MAXFNAMELEN];
+			int			save_errno;
+
+			/* if write didn't set errno, assume no disk space */
+			if (errno == 0)
+				errno = ENOSPC;
+
+			/* Keep errno intact across XLogFileName so that %m reports it. */
+			save_errno = errno;
+			XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+			errno = save_errno;
+			ereport(PANIC,
+					(errcode_for_file_access(),
+					 errmsg("could not write to log segment %s "
+							"at offset %u, length %lu: %m",
+							xlogfname, startoff, (unsigned long) segbytes)));
+		}
+
+		/*
+		 * Update state for write. Advancing by byteswritten (not segbytes)
+		 * means a short write is continued by the next loop iteration.
+		 */
+		recptr += byteswritten;
+
+		nbytes -= byteswritten;
+		buf += byteswritten;
+	}
+
+	/*
+	 * Close the current segment if it's fully written up in the last cycle of
+	 * the loop.
+	 */
+	if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
+	{
+		XLogWalPropClose(recptr);
+	}
+}
+
+/*
+ * Close the segment file currently held in walpropFile.
+ *
+ * May only be called while a segment is open and `recptr` lies outside of it
+ * (both are asserted). A failing close() raises PANIC; on success
+ * walpropFile is reset to -1 to mark that no segment is open.
+ */
+void
+XLogWalPropClose(XLogRecPtr recptr)
+{
+	char		xlogfname[MAXFNAMELEN];
+
+	Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
+
+	if (close(walpropFile) == 0)
+	{
+		walpropFile = -1;
+		return;
+	}
+
+	/* close() failed: name the segment in the error report. */
+	XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
+
+	ereport(PANIC,
+			(errcode_for_file_access(),
+			 errmsg("could not close log segment %s: %m",
+					xlogfname)));
+}
+
+/* START of cloned functions from walsender.c */
+
+/*
+ * Subscribe for new WAL and stream it to safekeepers in a loop.
+ *
+ * At the moment, this never returns, but an ereport(ERROR) will take us back
+ * to the main loop.
+ */
+void
+StartProposerReplication(StartReplicationCmd *cmd)
+{
+	XLogRecPtr	FlushPtr;
+	TimeLineID	currTLI;
+
+#if PG_VERSION_NUM < 150000
+	if (ThisTimeLineID == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
+#endif
+
+	/*
+	 * We assume here that we're logging enough information in the WAL for
+	 * log-shipping, since this is checked in PostmasterMain().
+	 *
+	 * NOTE: wal_level can only change at shutdown, so in most cases it is
+	 * difficult for there to be WAL data that we can still see that was
+	 * written at wal_level='minimal'.
+	 */
+
+	if (cmd->slotname)
+	{
+		ReplicationSlotAcquire(cmd->slotname, true);
+		if (SlotIsLogical(MyReplicationSlot))
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("cannot use a logical replication slot for physical replication")));
+
+		/*
+		 * We don't need to verify the slot's restart_lsn here; instead we
+		 * rely on the caller requesting the starting point to use.  If the
+		 * WAL segment doesn't exist, we'll fail later.
+		 */
+	}
+
+	/*
+	 * Fetch the current flush position and timeline. No client-supplied
+	 * timeline is consulted here, and currTLI is set but not used again
+	 * in this function.
+	 *
+	 * Neon doesn't currently use PG Timelines, but it may in the future, so
+	 * we keep this code around to lighten the load for when we need it.
+	 */
+#if PG_VERSION_NUM >= 150000
+	FlushPtr = GetFlushRecPtr(&currTLI);
+#else
+	FlushPtr = GetFlushRecPtr();
+	currTLI = ThisTimeLineID;
+#endif
+
+	/*
+	 * When we first start replication the standby will be behind the
+	 * primary. For some applications, for example synchronous
+	 * replication, it is important to have a clear state for this initial
+	 * catchup mode, so we can trigger actions when we change streaming
+	 * state later. We may stay in this state for a long time, which is
+	 * exactly why we want to be able to monitor whether or not we are
+	 * still here.
+	 */
+	WalSndSetState(WALSNDSTATE_CATCHUP);
+
+	/*
+	 * Don't allow a request to stream from a future point in WAL that
+	 * hasn't been flushed to disk in this server yet.
+	 */
+	if (FlushPtr < cmd->startpoint)
+	{
+		ereport(ERROR,
+				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+						LSN_FORMAT_ARGS(cmd->startpoint),
+						LSN_FORMAT_ARGS(FlushPtr))));
+	}
+
+	/* Start streaming from the requested point */
+	sentPtr = cmd->startpoint;
+
+	/* Initialize shared memory status, too */
+	SpinLockAcquire(&MyWalSnd->mutex);
+	MyWalSnd->sentPtr = sentPtr;
+	SpinLockRelease(&MyWalSnd->mutex);
+
+	SyncRepInitConfig();
+
+	/* Infinite send loop, never returns; the cleanup below is unreachable */
+	WalSndLoop();
+
+	WalSndSetState(WALSNDSTATE_STARTUP);
+
+	if (cmd->slotname)
+		ReplicationSlotRelease();
+}
+
+/*
+ * Endless loop: hand newly flushed WAL to the walproposer, then poll it.
+ * Synchronous replication sets latch in WalSndWakeup at walsender.c
+ */
+static void
+WalSndLoop(void)
+{
+	/* Clear any already-pending wakeups */
+	ResetLatch(MyLatch);
+
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		XLogBroadcastWalProposer();
+
+		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+			WalSndSetState(WALSNDSTATE_STREAMING);
+		WalProposerPoll();
+	}
+}
+
+/*
+ * Broadcast WAL from sentPtr up to the flush pointer, then advance sentPtr.
+ */
+static void
+XLogBroadcastWalProposer(void)
+{
+	XLogRecPtr	startptr;
+	XLogRecPtr	endptr;
+
+	/* Start from the last sent position */
+	startptr = sentPtr;
+
+	/*
+	 * Streaming the current timeline on a primary.
+	 *
+	 * Attempt to send all data that's already been written out and
+	 * fsync'd to disk.  We cannot go further than what's been written out
+	 * given the current implementation of WALRead().  And in any case
+	 * it's unsafe to send WAL that is not securely down to disk on the
+	 * primary: if the primary subsequently crashes and restarts, standbys
+	 * must not have applied any WAL that got lost on the primary.
+	 */
+#if PG_VERSION_NUM >= 150000
+	endptr = GetFlushRecPtr(NULL);
+#else
+	endptr = GetFlushRecPtr();
+#endif
+
+	/*
+	 * Record the current system time as an approximation of the time at which
+	 * this WAL location was written for the purposes of lag tracking.
+	 *
+	 * In theory we could make XLogFlush() record a time in shmem whenever WAL
+	 * is flushed and we could get that time as well as the LSN when we call
+	 * GetFlushRecPtr() above (and likewise for the cascading standby
+	 * equivalent), but rather than putting any new code into the hot WAL path
+	 * it seems good enough to capture the time here.  We should reach this
+	 * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
+	 * may take some time, we read the WAL flush pointer and take the time
+	 * very close together here so that we'll get a later position if it is
+	 * still moving.
+	 *
+	 * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
+	 * this gives us a cheap approximation for the WAL flush time for this
+	 * LSN.
+	 *
+	 * Note that the LSN is not necessarily the LSN for the data contained in
+	 * the present message; it's the end of the WAL, which might be further
+	 * ahead.  All the lag tracking machinery cares about is finding out when
+	 * that arbitrary LSN is eventually reported as written, flushed and
+	 * applied, so that it can measure the elapsed time.
+	 */
+	LagTrackerWrite(endptr, GetCurrentTimestamp());
+
+	/* Nothing to do unless the flush pointer has moved past sentPtr */
+	Assert(startptr <= endptr);
+	if (endptr <= startptr)
+		return;
+
+	WalProposerBroadcast(startptr, endptr);
+	sentPtr = endptr;
+
+	/* Update shared memory status */
+	{
+		WalSnd	   *walsnd = MyWalSnd;
+
+		SpinLockAcquire(&walsnd->mutex);
+		walsnd->sentPtr = sentPtr;
+		SpinLockRelease(&walsnd->mutex);
+	}
+
+	/* Report progress of XLOG streaming in PS display */
+	if (update_process_title)
+	{
+		char		activitymsg[50];
+
+		snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
+				 LSN_FORMAT_ARGS(sentPtr));
+		set_ps_display(activitymsg);
+	}
+}
--- a/pgxn/neon/walproposer_utils.h
+++ b/pgxn/neon/walproposer_utils.h
@@ -0,0 +1,19 @@
+#ifndef __NEON_WALPROPOSER_UTILS_H__
+#define __NEON_WALPROPOSER_UTILS_H__
+
+#include "walproposer.h"
+
+int			CompareLsn(const void *a, const void *b);
+char	   *FormatSafekeeperState(SafekeeperState state);
+void		AssertEventsOkForState(uint32 events, Safekeeper *sk);
+uint32		SafekeeperStateDesiredEvents(SafekeeperState state);
+char	   *FormatEvents(uint32 events);
+bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32		pq_getmsgint32_le(StringInfo msg);
+uint64		pq_getmsgint64_le(StringInfo msg);
+void		pq_sendint32_le(StringInfo buf, uint32 i);
+void		pq_sendint64_le(StringInfo buf, uint64 i);
+void		XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
+void		XLogWalPropClose(XLogRecPtr recptr);
+
+#endif							/* __NEON_WALPROPOSER_UTILS_H__ */
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1085,32 +1085,15 @@ class AbstractNeonCli(abc.ABC):
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
-
-        indent = "  "
        if not res.returncode:
-            stripped = res.stdout.strip()
-            lines = stripped.splitlines()
-            if len(lines) < 2:
-                log.debug(f"Run {res.args} success: {stripped}")
-            else:
-                log.debug("Run %s success:\n%s" % (res.args, textwrap.indent(stripped, indent)))
+            log.info(f"Run {res.args} success: {res.stdout}")
        elif check_return_code:
            # this way command output will be in recorded and shown in CI in failure message
-            indent = indent * 2
-            msg = textwrap.dedent(
-                """\
-            Run %s failed:
-              stdout:
-            %s
-              stderr:
-            %s
+            msg = f"""\
+            Run {res.args} failed:
+              stdout: {res.stdout}
+              stderr: {res.stderr}
            """
-            )
-            msg = msg % (
-                res.args,
-                textwrap.indent(res.stdout.strip(), indent),
-                textwrap.indent(res.stderr.strip(), indent),
-            )
            log.info(msg)
            raise RuntimeError(msg) from subprocess.CalledProcessError(
                res.returncode, res.args, res.stdout, res.stderr
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -116,10 +116,6 @@ def get_deletion_queue_submitted(ps_http) -> int:
    return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")


-def get_deletion_queue_validated(ps_http) -> int:
-    return get_metric_or_0(ps_http, "pageserver_deletion_queue_validated_total")
-
-
 def get_deletion_queue_dropped(ps_http) -> int:
    return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")

@@ -277,15 +273,12 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):


@pytest.mark.parametrize("keep_attachment", [True, False])
-@pytest.mark.parametrize("validate_before", [True, False])
 def test_deletion_queue_recovery(
-    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool
+    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
 ):
    """
    :param keep_attachment: If true, we re-attach after restart.  Else, we act as if some other
    node took the attachment while we were restarting.
-    :param validate_before: If true, we wait for deletions to be validated before restart.  This
-    makes them elegible to be executed after restart, if the same node keeps the attachment.
    """
    neon_env_builder.enable_generations = True
    neon_env_builder.enable_pageserver_remote_storage(
@@ -295,20 +288,12 @@ def test_deletion_queue_recovery(

    ps_http = env.pageserver.http_client()

-    failpoints = [
-        # Prevent deletion lists from being executed, to build up some backlog of deletions
-        ("deletion-queue-before-execute", "return"),
-    ]
-
-    if not validate_before:
-        failpoints.append(
-            # Prevent deletion lists from being validated, we will test that they are
-            # dropped properly during recovery.  'pause' is okay here because we kill
-            # the pageserver with immediate=true
-            ("control-plane-client-validate", "pause")
-        )
-
-    ps_http.configure_failpoints(failpoints)
+    # Prevent deletion lists from being executed, to build up some backlog of deletions
+    ps_http.configure_failpoints(
+        [
+            ("deletion-queue-before-execute", "return"),
+        ]
+    )

    generate_uploads_and_deletions(env)

@@ -320,16 +305,6 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

-    if validate_before:
-
-        def assert_validation_complete():
-            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
-
-        wait_until(20, 1, assert_validation_complete)
-        # A short wait to let the DeletionHeader get written out, as this happens after
-        # the validated count gets incremented.
-        time.sleep(1)
-
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

@@ -352,17 +327,14 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment or validate_before:
-        # - If we kept the attachment, then our pre-restart deletions should execute
-        #   because on re-attach they were from the immediately preceding generation
-        # - If we validated before restart, then the deletions should execute because the
-        #   deletion queue header records a validated deletion list sequence number.
+    if keep_attachment:
+        # If we kept the attachment, then our pre-restart deletions should have executed
+        # successfully
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
-        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
-
        # If we lost the attachment, we should have dropped our pre-restart deletions.
        assert get_deletion_queue_dropped(ps_http) == before_restart_depth
+        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

    assert get_deletion_queue_unexpected_errors(ps_http) == 0
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
Author	SHA1	Message	Date
Christian Schwarz	1863a04fb0	WIP	2023-10-05 18:21:18 +02:00
Christian Schwarz	f83a71ca6a	WIP	2023-10-05 18:13:54 +02:00
Christian Schwarz	74a634c9fa	WIP	2023-10-05 18:06:26 +02:00
Christian Schwarz	fc3f8a65b3	WIP: provide permits in requestcontext	2023-10-05 18:02:22 +02:00
Christian Schwarz	9f03dd24c2	page_cache: find_victim: prevent starvation	2023-10-05 16:54:02 +02:00
Christian Schwarz	dc96a7604a	page_cache: ensure forward progress on cache miss	2023-10-05 16:51:08 +02:00
Christian Schwarz	d7c94e67ce	inline lock_for_write and try_lock_for_write into memorize_materialized_page Motivation ========== It's the only user, and the name of `_for_write` is wrong as of commit `7a63685cde` Author: Christian Schwarz <christian@neon.tech> Date: Fri Aug 18 19:31:03 2023 +0200 simplify page-caching of EphemeralFile (#4994) Notes ===== This also allows us to get rid of the WriteBufResult type. Also rename `search_mapping_for_write` to `search_mapping_exact`. It makes more sense that way because there is `_for_write`-locking anymore.	2023-10-05 16:01:29 +02:00