From 80262e724fcb8081c441b440aa3e7dce0ab11d4d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 08:24:15 +0100 Subject: [PATCH 01/27] build(deps): bump werkzeug from 3.0.3 to 3.0.6 (#9527) --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index e307b873f3..7abd794235 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3118,13 +3118,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.3" +version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, - {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, + {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, + {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, ] [package.dependencies] @@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" +content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf" diff --git a/pyproject.toml b/pyproject.toml index 862ed49638..d4926cfb9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.3" +Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" From e7277885b3e95e41f3e2cab6c52d4e9e3981e27d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 26 Oct 2024 15:27:57 +0200 Subject: [PATCH 02/27] Don't consider archived timelines for synthetic size calculation (#9497) Archived timelines should not count towards synthetic size. Closes #9384. Part of #8088. --- pageserver/src/tenant/size.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 4a4c698b56..6c3276ea3c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -187,6 +187,8 @@ pub(super) async fn gather_inputs( // but it is unlikely to cause any issues. In the worst case, // the calculation will error out. timelines.retain(|t| t.is_active()); + // Also filter out archived timelines. + timelines.retain(|t| t.is_archived() != Some(true)); // Build a map of branch points. let mut branchpoints: HashMap> = HashMap::new(); From 923974d4da4f6f0df754f598149c7679aab0dad2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 08:47:12 +0000 Subject: [PATCH 03/27] safekeeper: don't un-evict timelines during snapshot API handler (#9428) ## Problem When we use pull_timeline API on an evicted timeline, it gets downloaded to serve the snapshot API request. 
That means that to evacuate all the timelines from a node, the node needs enough disk space to download partial segments from all timelines, which may not be physically the case. Closes: #8833 ## Summary of changes - Add a "try" variant of acquiring a residence guard, that returns None if the timeline is offloaded - During snapshot API handler, take a different code path if the timeline isn't resident, where we just read the checkpoint and don't try to read any segments. --- safekeeper/src/http/routes.rs | 8 -- safekeeper/src/pull_timeline.rs | 140 ++++++++++++++++++++--- safekeeper/src/timeline.rs | 59 +++++++--- safekeeper/src/timeline_eviction.rs | 3 + safekeeper/src/timeline_manager.rs | 27 +++++ test_runner/regress/test_wal_acceptor.py | 103 +++++++++++++++++ 6 files changed, 298 insertions(+), 42 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b4590fe3e5..df68f8a68e 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request) -> Result, // so create the chan and write to it in another task. diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c7f5165f90..c700e18cc7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, io::{self, ErrorKind}, + sync::Arc, }; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio_tar::{Archive, Builder, Header}; @@ -25,8 +26,8 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, - state::TimelinePersistentState, - timeline::WalResidentTimeline, + state::{EvictionState, TimelinePersistentState}, + timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup, wal_storage::open_wal_file, @@ -43,18 +44,33 @@ use utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] pub async fn stream_snapshot( - tli: WalResidentTimeline, + tli: Arc, source: NodeId, destination: NodeId, tx: mpsc::Sender>, ) { - if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { - // Error type/contents don't matter as they won't can't reach the client - // (hyper likely doesn't do anything with it), but http stream will be - // prematurely terminated. It would be nice to try to send the error in - // trailers though. - tx.send(Err(anyhow!("snapshot failed"))).await.ok(); - error!("snapshot failed: {:#}", e); + match tli.try_wal_residence_guard().await { + Err(e) => { + tx.send(Err(anyhow!("Error checking residence: {:#}", e))) + .await + .ok(); + } + Ok(maybe_resident_tli) => { + if let Err(e) = match maybe_resident_tli { + Some(resident_tli) => { + stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) + .await + } + None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, + } { + // Error type/contents don't matter as they won't can't reach the client + // (hyper likely doesn't do anything with it), but http stream will be + // prematurely terminated. It would be nice to try to send the error in + // trailers though. 
+ tx.send(Err(anyhow!("snapshot failed"))).await.ok(); + error!("snapshot failed: {:#}", e); + } + } } } @@ -80,12 +96,10 @@ impl Drop for SnapshotContext { } } -pub async fn stream_snapshot_guts( - tli: WalResidentTimeline, - source: NodeId, - destination: NodeId, +/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel. +fn prepare_tar_stream( tx: mpsc::Sender>, -) -> Result<()> { +) -> tokio_tar::Builder { // tokio-tar wants Write implementor, but we have mpsc tx >; // use SinkWriter as a Write impl. That is, // - create Sink from the tx. It returns PollSendError if chan is closed. @@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts( // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap // into CopyToBytes. This is a data copy. let copy_to_bytes = CopyToBytes::new(oksink); - let mut writer = SinkWriter::new(copy_to_bytes); - let pinned_writer = std::pin::pin!(writer); + let writer = SinkWriter::new(copy_to_bytes); + let pinned_writer = Box::pin(writer); // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer // which is also likely suboptimal. - let mut ar = Builder::new_non_terminated(pinned_writer); + Builder::new_non_terminated(pinned_writer) +} + +/// Implementation of snapshot for an offloaded timeline, only reads control file +pub(crate) async fn stream_snapshot_offloaded_guts( + tli: Arc, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); + + tli.snapshot_offloaded(&mut ar, source, destination).await?; + + ar.finish().await?; + + Ok(()) +} + +/// Implementation of snapshot for a timeline which is resident (includes some segment data) +pub async fn stream_snapshot_resident_guts( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); @@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts( Ok(()) } +impl Timeline { + /// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and + /// pass a modified control file into the provided tar stream (nothing with data segments on disk, since + /// we are offloaded and there aren't any) + async fn snapshot_offloaded( + self: &Arc, + ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, + ) -> Result<()> { + // Take initial copy of control file, then release state lock + let mut control_file = { + let shared_state = self.write_shared_state().await; + + let control_file = TimelinePersistentState::clone(shared_state.sk.state()); + + // Rare race: we got unevicted between entering function and reading control file. + // We error out and let API caller retry. + if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) { + bail!("Timeline was un-evicted during snapshot, please retry"); + } + + control_file + }; + + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. + let replace = control_file + .partial_backup + .replace_uploaded_segment(source, destination)?; + + let Some(replace) = replace else { + // In Manager:: ready_for_eviction, we do not permit eviction unless the timeline + // has a partial segment. 
It is unexpected that + anyhow::bail!("Timeline has no partial segment, cannot generate snapshot"); + }; + + tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}"); + + // Optimistically try to copy the partial segment to the destination's path: this + // can fail if the timeline was un-evicted and modified in the background. + let remote_timeline_path = &self.remote_path; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), + ) + .await?; + + // Since the S3 copy succeeded with the path given in our control file snapshot, and + // we are sending that snapshot in our response, we are giving the caller a consistent + // snapshot even if our local Timeline was unevicted or otherwise modified in the meantime. + let buf = control_file + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; + + Ok(()) + } +} + impl WalResidentTimeline { /// Start streaming tar archive with timeline: /// 1) stream control file under lock; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c737dfcf9b..f0113978c4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -797,14 +797,17 @@ impl Timeline { state.sk.term_bump(to).await } - /// Get the timeline guard for reading/writing WAL files. - /// If WAL files are not present on disk (evicted), they will be automatically - /// downloaded from remote storage. This is done in the manager task, which is - /// responsible for issuing all guards. - /// - /// NB: don't use this function from timeline_manager, it will deadlock. - /// NB: don't use this function while holding shared_state lock. - pub async fn wal_residence_guard(self: &Arc) -> Result { + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] + async fn do_wal_residence_guard( + self: &Arc, + block: bool, + ) -> Result> { + let op_label = if block { + "wal_residence_guard" + } else { + "try_wal_residence_guard" + }; + if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -816,10 +819,13 @@ impl Timeline { // Wait 30 seconds for the guard to be acquired. It can time out if someone is // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task // is stuck. 
- let res = tokio::time::timeout_at( - started_at + Duration::from_secs(30), - self.manager_ctl.wal_residence_guard(), - ) + let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async { + if block { + self.manager_ctl.wal_residence_guard().await.map(Some) + } else { + self.manager_ctl.try_wal_residence_guard().await + } + }) .await; let guard = match res { @@ -827,14 +833,14 @@ impl Timeline { let finished_at = Instant::now(); let elapsed = finished_at - started_at; MISC_OPERATION_SECONDS - .with_label_values(&["wal_residence_guard"]) + .with_label_values(&[op_label]) .observe(elapsed.as_secs_f64()); guard } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "error acquiring in {op_label}, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -842,7 +848,7 @@ impl Timeline { } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "timeout acquiring in {op_label} guard, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -850,7 +856,28 @@ impl Timeline { } }; - Ok(WalResidentTimeline::new(self.clone(), guard)) + Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g))) + } + + /// Get the timeline guard for reading/writing WAL files. + /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. + pub async fn wal_residence_guard(self: &Arc) -> Result { + self.do_wal_residence_guard(true) + .await + .map(|m| m.expect("Always get Some in block=true mode")) + } + + /// Get the timeline guard for reading/writing WAL files if the timeline is resident, + /// else return None + pub(crate) async fn try_wal_residence_guard( + self: &Arc, + ) -> Result> { + self.do_wal_residence_guard(false).await } pub async fn backup_partial_reset(self: &Arc) -> Result> { diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index f5363ae9b0..303421c837 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -56,6 +56,9 @@ impl Manager { // This also works for the first segment despite last_removed_segno // being 0 on init because this 0 triggers run of wal_removal_task // on success of which manager updates the horizon. + // + // **Note** pull_timeline functionality assumes that evicted timelines always have + // a partial segment: if we ever change this condition, must also update that code. && self .partial_backup_uploaded .as_ref() diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f0583dd3ff..79200fff8d 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300); pub enum ManagerCtlMessage { /// Request to get a guard for WalResidentTimeline, with WAL files available locally. GuardRequest(tokio::sync::oneshot::Sender>), + /// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None + TryGuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. GuardDrop(GuardId), /// Request to reset uploaded partial backup state. 
@@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), + ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } @@ -152,6 +155,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Issue a new guard if the timeline is currently not offloaded, else return None + /// Sends a message to the manager and waits for the response. + /// Can be blocked indefinitely if the manager is stuck. + pub async fn try_wal_residence_guard(&self) -> anyhow::Result> { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::TryGuardRequest(tx))?; + + // wait for the manager to respond with the guard + rx.await + .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e)) + } + /// Request timeline manager to reset uploaded partial segment state and /// wait for the result. pub async fn backup_partial_reset(&self) -> anyhow::Result> { @@ -674,6 +690,17 @@ impl Manager { warn!("failed to reply with a guard, receiver dropped"); } } + Some(ManagerCtlMessage::TryGuardRequest(tx)) => { + let result = if self.is_offloaded { + None + } else { + Some(self.access_service.create_guard()) + }; + + if tx.send(result).is_err() { + warn!("failed to reply with a guard, receiver dropped"); + } + } Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d803cd7c78..157390c01c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1998,6 +1998,109 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): pt_handle.join() +def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): + """ + Verify that when pull_timeline is used on an evicted timeline, it does not result in + promoting any segments to local disk on the source, and the timeline is correctly instantiated + in evicted state on the destination. This behavior is important to avoid ballooning disk + usage when doing mass migration of timelines. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. 
+ "--eviction-min-resident=100ms", + "--delete-offloaded-wal", + ] + + initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"} + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1]) + log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}") + + ep = env.endpoints.create("main") + ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id] + log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}") + ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially + ep.start() + ep.safe_psql("CREATE TABLE t(i int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver) + + src_http = src_sk.http_client() + dst_http = dst_sk.http_client() + + def evicted_on_source(): + # Wait for timeline to go into evicted state + assert src_http.get_eviction_state(timeline_id) != "Present" + assert ( + src_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + ) + assert src_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + # Check that on source no segment files are present + assert src_sk.list_segments(tenant_id, timeline_id) == [] + + wait_until(60, 1, evicted_on_source) + + # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, + # destination should import the control file only & go into evicted mode immediately + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + # Check that on source and destination no segment files are present + assert src_sk.list_segments(tenant_id, timeline_id) == [] + assert dst_sk.list_segments(tenant_id, timeline_id) == [] + + # Check that the timeline on the destination is in the expected evicted state. + evicted_on_source() # It should still be evicted on the source + + def evicted_on_destination(): + assert dst_http.get_eviction_state(timeline_id) != "Present" + assert dst_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + + # This should be fast, it is a wait_until because eviction state is updated + # in the background wrt pull_timeline. + wait_until(10, 0.1, evicted_on_destination) + + # Delete the timeline on the source, to prove that deletion works on an + # evicted timeline _and_ that the final compute test is really not using + # the original location + src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True) + + # Check that using the timeline correctly un-evicts it on the new location + ep.active_safekeepers = [2, 3, 4] + ep.start() + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + def unevicted_on_dest(): + assert ( + dst_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + ) + n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted == 0 + + wait_until(10, 1, unevicted_on_dest) + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 33baca07b69bf674113d53a4c7f6e53b8e7a3396 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 09:26:01 +0000 Subject: [PATCH 04/27] storcon: add an API to cancel ongoing reconciler (#9520) ## Problem If something goes wrong with a live migration, we currently only have awkward ways to interrupt that: - Restart the storage controller - Ask it to do some other modification/migration on the shard, which we don't really want. ## Summary of changes - Add a new `/cancel` control API, and storcon_cli wrapper for it, which fires the Reconciler's cancellation token. This is just for on-call use and we do not expect it to be used by any other services. --- control_plane/storcon_cli/src/main.rs | 14 +++++++ storage_controller/src/http.rs | 32 ++++++++++++++++ storage_controller/src/service.rs | 37 +++++++++++++++++++ storage_controller/src/tenant_shard.rs | 6 +++ .../regress/test_storage_controller.py | 15 +++++++- 5 files changed, 103 insertions(+), 1 deletion(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 73d89699ed..b7f38c6286 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -111,6 +111,11 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Cancel any ongoing reconciliation for this shard + TenantShardCancelReconcile { + #[arg(long)] + tenant_shard_id: TenantShardId, + }, /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. TenantConfig { @@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardCancelReconcile { tenant_shard_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"), + None, + ) + .await?; + } Command::TenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index afefe8598c..face3d2c2d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_cancel_reconcile( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + json_response( + StatusCode::OK, + service + .tenant_shard_cancel_reconcile(tenant_shard_id) + .await?, + ) +} + async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1776,6 +1798,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_cancel_reconcile, + RequestName("control_v1_tenant_cancel_reconcile"), + ) + }, + ) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler( r, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a2a6e63dd2..32029c1232 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4834,6 
+4834,43 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + /// 'cancel' in this context means cancel any ongoing reconcile + pub(crate) async fn tenant_shard_cancel_reconcile( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), ApiError> { + // Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete + let waiter = { + let locked = self.inner.write().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let waiter = shard.get_waiter(); + match waiter { + None => { + tracing::info!("Shard does not have an ongoing Reconciler"); + return Ok(()); + } + Some(waiter) => { + tracing::info!("Cancelling Reconciler"); + shard.cancel_reconciler(); + waiter + } + } + }; + + // Cancellation should be prompt. If this fails we have still done our job of firing the + // cancellation token, but by returning an ApiError we will indicate to the caller that + // the Reconciler is misbehaving and not respecting the cancellation token + self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT) + .await?; + + Ok(()) + } + /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e696c72ba7..27c97d3b86 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1317,6 +1317,12 @@ impl TenantShard { }) } + pub(crate) fn cancel_reconciler(&self) { + if let Some(handle) = self.reconciler.as_ref() { + handle.cancel.cancel() + } + } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation /// if it is not already running pub(crate) fn get_waiter(&self) -> Option { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d4bc4b1a4f..40fee7661a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -872,6 +872,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + # Reconciler cancel API should be a no-op when nothing is in flight + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", @@ -879,6 +887,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): ) assert len(env.storage_controller.node_list()) == 1 + # Tenant unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", @@ -892,7 +901,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) assert len(response.json()) == 1 - # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # meant to be unclean wrt the pageserver state, but not leave a 
broken storage controller behind. env.storage_controller.consistency_check() @@ -1660,6 +1668,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] + # Cancel ongoing reconcile on a tenant + storcon_cli( + ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] + ) + # Change a tenant's placement storcon_cli( ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] From 93987b5a4a1defe6d6e99a8e63c3652b26eace1f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 11:11:12 +0000 Subject: [PATCH 05/27] tests: add test_storage_controller_onboard_detached (#9431) ## Problem We haven't historically taken this API route where we would onboard a tenant to the controller in detached state. It worked, but we didn't have test coverage. ## Summary of changes - Add a test that onboards a tenant to the storage controller in Detached mode, and checks that deleting it without attaching it works as expected. --- .../regress/test_storage_controller.py | 98 +++++++++++++++++-- 1 file changed, 91 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 40fee7661a..c8de292588 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -18,6 +18,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, PgBin, @@ -298,17 +299,20 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -@pytest.mark.parametrize("warm_up", [True, False]) -def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def prepare_onboarding_env( + neon_env_builder: NeonEnvBuilder, +) -> tuple[NeonEnv, NeonPageserver, TenantId, int]: """ - We onboard tenants to the sharding service by treating it as a 'virtual pageserver' - which provides the /location_config API. This is similar to creating a tenant, - but imports the generation number. + For tests that do onboarding of a tenant to the storage controller, a small dance to + set up one pageserver that won't be managed by the storage controller and create + a tenant there. 
""" - # One pageserver to simulate legacy environment, two to be managed by storage controller neon_env_builder.num_pageservers = 3 + # Enable tests to use methods that require real S3 API + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() env.broker.start() @@ -329,7 +333,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # will be attached after onboarding env.pageservers[1].start() env.pageservers[2].start() - virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: sk.start() @@ -339,6 +342,23 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up generation = 123 origin_ps.tenant_create(tenant_id, generation=generation) + origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) + + return (env, origin_ps, tenant_id, generation) + + +@pytest.mark.parametrize("warm_up", [True, False]) +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( tenant_id, @@ -475,6 +495,70 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder): + """ + Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver, + and also wasn't ever registered with the storage controller. + + It may do this by calling /location_conf in mode Detached and then calling the delete API + as normal. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + + # Detach it from its original pageserver. 
+ origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # Since we will later assert that remote data is gone, as a control also check it was ever there + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + # Register with storage controller in Detached state + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + generation += 1 + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards + + # Onboarding in Detached state shouldn't have attached it to any pageserver + for ps in env.pageservers: + assert ps.http_client().tenant_list() == [] + + # Delete it via the storage controller + virtual_ps_http.tenant_delete(tenant_id) + + # Check that we really deleted it + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, From 01b6843e1214496343d3401081e4bede17d8a025 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 28 Oct 2024 12:09:47 +0000 Subject: [PATCH 06/27] Route pgbouncer logs to virtio-serial (#9488) virtio-serial is much more performant than /dev/console emulation, therefore, is much more suitable for the verbose logs inside vm. This commit changes routing for pgbouncer logs, since we've recently noticed it can emit large volumes of logs. Manually tested on staging by pinning a compute image to my test project. Should help with https://github.com/neondatabase/cloud/issues/19072 --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 51a55b513f..79f894c289 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 43e57a4ed5..ff04b9e4c6 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn From 8dd555d3964ae28cabec65a7e59a7047b47bac25 Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Mon, 28 Oct 2024 13:17:09 +0100 Subject: [PATCH 07/27] ci(proxy): Update GH action flag on proxy deployment (#9535) ## Problem Based on a recent proxy deployment issue, we deployed another proxy version (proxy-scram), which was not needed when deploying a specific proxy type. we have [PR](https://github.com/neondatabase/infra/pull/2142) to update on the infra branch and need to update CI in this repo which triggers proxy deployment. 
## Summary of changes - Update proxy deployment flag ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/build_and_test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0d3ea7db28..82a24b29d3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1116,7 +1116,11 @@ jobs: gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ - -f deployProxy=true \ + -f deployProxyLink=true \ + -f deployPrivatelinkProxy=true \ + -f deployLegacyProxyScram=true \ + -f deployProxyScram=true \ + -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else From 25f1e5cfebe865e0c7126f0b0e0ca9e00be0731b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 28 Oct 2024 15:02:20 +0000 Subject: [PATCH 08/27] [proxy] demote warnings and remove dead-argument (#9512) fixes https://github.com/neondatabase/cloud/issues/19000 --- proxy/src/auth/backend/hacks.rs | 6 +++--- proxy/src/bin/proxy.rs | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 8ab8d5d37f..28bdacd769 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; @@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext( secret: AuthSecret, config: &'static AuthenticationConfig, ) -> auth::Result { - warn!("cleartext auth flow override is enabled, proceeding"); + debug!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client @@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication( info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result<(ComputeUserInfo, Vec)> { - warn!("project not specified, resorting to the password hack auth flow"); + debug!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 6e190029aa..82c259efc8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -137,9 +137,6 @@ struct ProxyCliArgs { /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, - /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - disable_dynamic_rate_limiter: bool, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form `@`. 
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - if !args.disable_dynamic_rate_limiter { - bail!("dynamic rate limiter should be disabled"); - } let config::ConcurrencyLockOptions { shards, From 3d64a7ddcdf23f8eefc343258438c91251d58488 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Oct 2024 11:23:30 -0500 Subject: [PATCH 09/27] Add pg_mooncake to compute-node.Dockerfile Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 1 + compute/compute-node.Dockerfile | 37 ++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 82a24b29d3..c308c41efd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -839,6 +839,7 @@ jobs: - name: Build vm image run: | ./vm-builder \ + -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6451e309f0..dfed01daae 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -666,7 +666,7 @@ RUN apt-get update && \ # # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes -# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ @@ -860,13 +860,14 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \ - esac && \ - curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ + case "${PG_VERSION}" in \ + 'v17') \ + echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ + esac && \ cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' @@ -1041,6 +1042,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control +######################################################################################### +# +# Layer "pg_mooncake" +# compile pg_mooncake extension +# +######################################################################################### +FROM rust-extensions-build AS pg-mooncake-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1 +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN case "${PG_VERSION}" in \ + 'v14') \ + echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ + esac && \ + git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ + cd pg_mooncake-src && \ + git checkout "${PG_MOONCAKE_VERSION}" && \ + git submodule update --init --depth 1 --recursive && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1084,6 +1110,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 3bad52543fa018a11beded31885d95150e6f907a Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 28 Oct 2024 17:42:35 +0100 Subject: [PATCH 10/27] We don't have legacy proxies anymore (#9544) We don't have legacy scram proxies anymore: cc: https://github.com/neondatabase/cloud/issues/9745 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c308c41efd..bba51ddc92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1119,7 +1119,6 @@ jobs: -f deployPgSniRouter=true \ -f deployProxyLink=true \ -f deployPrivatelinkProxy=true \ - -f deployLegacyProxyScram=true \ -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ From 248558dee85849fd95fece7f8e0a730c14eb0660 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 28 Oct 2024 18:18:37 +0100 Subject: [PATCH 11/27] safekeeper: refactor `WalAcceptor` to be event-driven (#9462) ## Problem The `WalAcceptor` main loop currently uses two nested loops to consume inbound messages. This makes it hard to slot in periodic events like metrics collection. It also duplicates the event processing code, and assumes all messages in steady state are AppendRequests (other messages types may be dropped if following an AppendRequest). ## Summary of changes Refactor the `WalAcceptor` loop to be event driven. 
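For orientation, here is a minimal, self-contained sketch of the select-plus-ticker shape the loop moves to (the message type and the flush bodies are placeholders, not the real safekeeper API; the actual loop is in the diff below):

```rust
use tokio::sync::mpsc::Receiver;
use tokio::time::{interval, Duration, MissedTickBehavior};

/// Simplified event loop: inbound messages and the periodic flush are both
/// branches of one `select!`, so further periodic events (metrics collection,
/// keepalives, ...) can be added as extra branches without restructuring.
async fn wal_acceptor_loop(mut msg_rx: Receiver<Vec<u8>>) {
    let mut flush_ticker = interval(Duration::from_secs(1));
    flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
    let mut unflushed = 0usize; // bytes appended since the last flush

    loop {
        tokio::select! {
            msg = msg_rx.recv() => {
                let Some(msg) = msg else { break }; // sender gone: stop
                // append without fsync; flushing is batched by the ticker
                unflushed += msg.len();
            }
            // flush (and acknowledge) at most once per interval, and only
            // when there is something to flush
            _ = flush_ticker.tick(), if unflushed > 0 => {
                println!("flushing {unflushed} bytes");
                unflushed = 0;
            }
        }
    }
    // flush whatever is left on disconnect
    if unflushed > 0 {
        println!("final flush of {unflushed} bytes");
    }
}
```

The real implementation in the diff additionally flushes as soon as the inbound channel drains instead of always waiting for the next tick, and performs a final flush on disconnect.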
--- safekeeper/src/receive_wal.rs | 120 ++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3dbf72298f..f97e127a17 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -21,18 +21,15 @@ use postgres_backend::QueryError; use pq_proto::BeMessage; use serde::Deserialize; use serde::Serialize; +use std::future; use std::net::SocketAddr; use std::sync::Arc; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; -use tokio::sync::mpsc::channel; -use tokio::sync::mpsc::error::TryRecvError; -use tokio::sync::mpsc::Receiver; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::task; use tokio::task::JoinHandle; -use tokio::time::Duration; -use tokio::time::Instant; +use tokio::time::{Duration, MissedTickBehavior}; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -444,9 +441,9 @@ async fn network_write( } } -// Send keepalive messages to walproposer, to make sure it receives updates -// even when it writes a steady stream of messages. -const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); +/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to +/// walproposer, even when it's writing a steady stream of messages. +const FLUSH_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes /// replies to reply_tx. @@ -494,67 +491,76 @@ impl WalAcceptor { async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - // After this timestamp we will stop processing AppendRequests and send a response - // to the walproposer. walproposer sends at least one AppendRequest per second, - // we will send keepalives by replying to these requests once per second. - let mut next_keepalive = Instant::now(); + // Periodically flush the WAL. + let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL); + flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + flush_ticker.tick().await; // skip the initial, immediate tick - while let Some(mut next_msg) = self.msg_rx.recv().await { - // Update walreceiver state in shmem for reporting. - if let ProposerAcceptorMessage::Elected(_) = &next_msg { - walreceiver_guard.get().status = WalReceiverStatus::Streaming; - } + // Tracks unflushed appends. + let mut dirty = false; - let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // Loop through AppendRequests while available to write as many WAL records as - // possible without fsyncing. - // - // Make sure the WAL is flushed before returning, see: - // https://github.com/neondatabase/neon/issues/9259 - // - // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. - // Otherwise, we might end up in a situation where we read a message, but don't - // process it. - while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { - let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - - if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { - if self.reply_tx.send(reply).await.is_err() { - break; // disconnected, flush WAL and return on next send/recv - } - } - - // get out of this loop if keepalive time is reached - if Instant::now() >= next_keepalive { + loop { + let reply = tokio::select! { + // Process inbound message. 
+ msg = self.msg_rx.recv() => { + // If disconnected, break to flush WAL and return. + let Some(mut msg) = msg else { break; + }; + + // Update walreceiver state in shmem for reporting. + if let ProposerAcceptorMessage::Elected(_) = &msg { + walreceiver_guard.get().status = WalReceiverStatus::Streaming; } - // continue pulling AppendRequests if available - match self.msg_rx.try_recv() { - Ok(msg) => next_msg = msg, - Err(TryRecvError::Empty) => break, - // on disconnect, flush WAL and return on next send/recv - Err(TryRecvError::Disconnected) => break, - }; + // Don't flush the WAL on every append, only periodically via flush_ticker. + // This batches multiple appends per fsync. If the channel is empty after + // sending the reply, we'll schedule an immediate flush. + if let ProposerAcceptorMessage::AppendRequest(append_request) = msg { + msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + dirty = true; + } + + self.tli.process_msg(&msg).await? } - // flush all written WAL to the disk - self.tli - .process_msg(&ProposerAcceptorMessage::FlushWAL) - .await? - } else { - // process message other than AppendRequest - self.tli.process_msg(&next_msg).await? + // While receiving AppendRequests, flush the WAL periodically and respond with an + // AppendResponse to let walproposer know we're still alive. + _ = flush_ticker.tick(), if dirty => { + dirty = false; + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } + + // If there are no pending messages, flush the WAL immediately. + // + // TODO: this should be done via flush_ticker.reset_immediately(), but that's always + // delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866. + _ = future::ready(()), if dirty && self.msg_rx.is_empty() => { + dirty = false; + flush_ticker.reset(); + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } }; - if let Some(reply) = reply_msg { + // Send reply, if any. + if let Some(reply) = reply { if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, break to flush WAL and return } - // reset keepalive time - next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + + // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259. + if dirty { + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await?; + } + Ok(()) } } From 57c21aff9f7a3074292f20efac319e3b248da484 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:51:14 -0400 Subject: [PATCH 12/27] refactor(pageserver): remove aux v1 configs (#9494) ## Problem Part of https://github.com/neondatabase/neon/issues/8623 ## Summary of changes Removed all aux-v1 config processing code. Note that we persisted it into the index part file, so we cannot really remove the field from index part. I also kept the config item within the tenant config, but we will not read it any more. 
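As a generic illustration of the compatibility note above (keeping a persisted field that the code no longer reads), here is a minimal sketch; the struct and field layout are made up for illustration and are not the real `index_part` definitions:

```rust
use serde::{Deserialize, Serialize};

// Made-up, trimmed-down stand-in for an index_part-style struct. The legacy
// key is kept as an optional field so an existing file that still carries it
// can be read and written back without dropping the value, even though no
// code path consults it anymore.
#[derive(Debug, Serialize, Deserialize)]
struct IndexPartCompat {
    disk_consistent_lsn: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    last_aux_file_policy: Option<String>, // legacy; carried along, never read
}

fn main() {
    let old = r#"{"disk_consistent_lsn":"0/16960E8","last_aux_file_policy":"V2"}"#;
    let parsed: IndexPartCompat = serde_json::from_str(old).unwrap();
    // Rewriting the file keeps the legacy key intact for older readers.
    println!("{}", serde_json::to_string(&parsed).unwrap());
}
```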
--------- Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 12 +- libs/pageserver_api/src/config.rs | 7 - libs/pageserver_api/src/models.rs | 129 ------------------ pageserver/pagebench/src/cmd/aux_files.rs | 7 +- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/config.rs | 9 -- storage_controller/src/service.rs | 11 +- test_runner/fixtures/neon_cli.py | 9 -- test_runner/fixtures/neon_fixtures.py | 12 -- test_runner/fixtures/parametrize.py | 11 -- test_runner/fixtures/utils.py | 16 --- .../performance/test_logical_replication.py | 3 +- .../regress/test_attach_tenant_config.py | 1 - .../regress/test_logical_replication.py | 17 --- 14 files changed, 5 insertions(+), 240 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5b5828c6ed..8df0a714ec 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,7 +17,7 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo}; +use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; @@ -399,11 +399,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - .remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") @@ -499,11 +494,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - .remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 896a5d8069..6b2d6cf625 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -250,12 +250,6 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: crate::models::AuxFilePolicy, - /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
#[serde(with = "humantime_serde")] @@ -475,7 +469,6 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d37f62185c..0a4992aea4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,7 +10,6 @@ use std::{ io::{BufRead, Read}, num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, - sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -309,7 +308,6 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub switch_aux_file_policy: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, } @@ -350,68 +348,6 @@ pub enum AuxFilePolicy { CrossValidation, } -impl AuxFilePolicy { - pub fn is_valid_migration_path(from: Option, to: Self) -> bool { - matches!( - (from, to), - (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) - ) - } - - /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. - pub fn default_tenant_config() -> Self { - Self::V2 - } -} - -/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. -pub struct AtomicAuxFilePolicy(AtomicUsize); - -impl AtomicAuxFilePolicy { - pub fn new(policy: Option) -> Self { - Self(AtomicUsize::new( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - )) - } - - pub fn load(&self) -> Option { - match self.0.load(std::sync::atomic::Ordering::Acquire) { - 0 => None, - other => Some(AuxFilePolicy::from_usize(other)), - } - } - - pub fn store(&self, policy: Option) { - self.0.store( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - std::sync::atomic::Ordering::Release, - ); - } -} - -impl AuxFilePolicy { - pub fn to_usize(self) -> usize { - match self { - Self::V1 => 1, - Self::CrossValidation => 2, - Self::V2 => 3, - } - } - - pub fn try_from_usize(this: usize) -> Option { - match this { - 1 => Some(Self::V1), - 2 => Some(Self::CrossValidation), - 3 => Some(Self::V2), - _ => None, - } - } - - pub fn from_usize(this: usize) -> Self { - Self::try_from_usize(this).unwrap() - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum EvictionPolicy { @@ -1633,71 +1569,6 @@ mod tests { } } - #[test] - fn test_aux_file_migration_path() { - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V1 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V2 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::CrossValidation - )); - // Self-migration is not a valid migration path, and the caller should handle it by itself. 
- assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::CrossValidation - )); - // Migrations not allowed - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::CrossValidation - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::CrossValidation - )); - // Migrations allowed - assert!(AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V2 - )); - } - - #[test] - fn test_aux_parse() { - assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); - assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); - assert_eq!( - AuxFilePolicy::from_str("cross-validation").unwrap(), - AuxFilePolicy::CrossValidation - ); - } - #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index bce3285606..923a7f1f18 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,4 +1,4 @@ -use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { mgmt_api_client .tenant_config(&TenantConfigRequest { tenant_id: timeline.tenant_id, - config: TenantConfig { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f846e145c5..64e871cada 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4853,7 +4853,6 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), - switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 502cb62fe8..ce686c89ef 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,7 +9,6 @@ //! may lead to a data loss. //! 
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -341,10 +340,6 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub switch_aux_file_policy: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -410,9 +405,6 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), - switch_aux_file_policy: self - .switch_aux_file_policy - .unwrap_or(global_conf.switch_aux_file_policy), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -470,7 +462,6 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - switch_aux_file_policy: value.switch_aux_file_policy, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 32029c1232..3f6cbfef59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4958,16 +4958,7 @@ impl Service { stripe_size, }, placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking - - // There is no way to know what the tenant's config was: revert to defaults - // - // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration - // - // we write to both v1+v2 storage, so that the test case can use either storage format for testing - config: TenantConfig { - switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), - ..TenantConfig::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 1b2767e296..d220ea57a2 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -16,7 +16,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump from fixtures.pg_version import PgVersion -from fixtures.utils import AuxFileStore if TYPE_CHECKING: from typing import ( @@ -201,7 +200,6 @@ class NeonLocalCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ): """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -223,13 +221,6 @@ class NeonLocalCli(AbstractNeonCli): ) ) - if aux_file_policy is AuxFileStore.V2: - args.extend(["-c", "switch_aux_file_policy:v2"]) - elif aux_file_policy is AuxFileStore.V1: - args.extend(["-c", "switch_aux_file_policy:v1"]) - elif aux_file_policy is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross-validation"]) - if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6491069f20..a8ec144fe9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -94,7 +94,6 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) -from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint @@ -353,7 +352,6 @@ class NeonEnvBuilder: initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, - pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, @@ -405,8 +403,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_aux_file_policy = pageserver_aux_file_policy - self.safekeeper_extra_opts = safekeeper_extra_opts self.storage_controller_port_override = storage_controller_port_override @@ -467,7 +463,6 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, - aux_file_policy=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -1027,7 +1022,6 @@ class NeonEnv: self.control_plane_compute_hook_api = config.control_plane_compute_hook_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine - self.pageserver_aux_file_policy = config.pageserver_aux_file_policy self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` @@ -1323,7 +1317,6 @@ class NeonEnv: shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ) -> tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -1340,7 +1333,6 @@ class NeonEnv: shard_stripe_size=shard_stripe_size, placement_policy=placement_policy, set_default=set_default, - aux_file_policy=aux_file_policy, ) return tenant_id, timeline_id @@ -1398,7 +1390,6 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: @@ -1431,7 +1422,6 @@ def neon_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, combination=combination, @@ -1458,7 +1448,6 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: @@ -1501,7 +1490,6 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 4114c2fcb3..1131bf090f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -10,12 +10,6 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion -if TYPE_CHECKING: - from typing import Any, Optional - - from fixtures.utils import AuxFileStore - - if TYPE_CHECKING: from typing import Any, Optional @@ -50,11 +44,6 @@ def pageserver_virtual_file_io_mode() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -@pytest.fixture(scope="function", autouse=True) -def pageserver_aux_file_policy() -> Optional[AuxFileStore]: - return None - - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d12fa59abc..01b7cf1026 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import enum import json import os import re @@ -515,21 +514,6 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]): assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" -@enum.unique -class AuxFileStore(str, enum.Enum): - V1 = "v1" - V2 = "v2" - CrossValidation = "cross-validation" - - @override - def __repr__(self) -> str: - return f"'aux-{self.value}'" - - @override - def __str__(self) -> str: - return f"'aux-{self.value}'" - - def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]): """ This is 
essentially: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 815d186ab9..8b2a296bdd 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -9,7 +9,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.neon_fixtures import logical_replication_sync if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker @@ -17,7 +17,6 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv, PgBin -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 4a7017994d..83d003a5cc 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -172,7 +172,6 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "cross-validation", "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", } diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index c26bf058e2..30027463df 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -5,11 +5,9 @@ from functools import partial from random import choice from string import ascii_lowercase -import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( - AuxFileStore, NeonEnv, NeonEnvBuilder, PgProtocol, @@ -23,17 +21,6 @@ def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] -) -def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): - env = neon_simple_env - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(env.initial_tenant).effective_config - assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] - - -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -173,7 +160,6 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -350,7 +336,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. 
-@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -395,7 +380,6 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -467,7 +451,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env From f7c61e856f05e4a796ef82bff53e7b9a01b3d0f3 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:03:02 -0400 Subject: [PATCH 13/27] fix(pageserver): bump tokio-epoll-uring (#9546) Includes https://github.com/neondatabase/tokio-epoll-uring/pull/58 that fixes the clippy error. ## Summary of changes Update the version of tokio-epoll-uring Signed-off-by: Alex Chi Z --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7fa5df29fd..610b607482 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6272,7 +6272,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "futures", "nix 0.26.4", @@ -6788,7 +6788,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "bytes", "io-uring", From 7d5f6b6a528de4068ace6eac2d45305d91a7d011 Mon Sep 17 00:00:00 2001 From: George MacKerron Date: Mon, 28 Oct 2024 20:06:36 +0000 Subject: [PATCH 14/27] Build `pgrag` extensions x3 (#8486) Build the pgrag extensions (rag, rag_bge_small_en_v15, and rag_jina_reranker_v1_tiny_en) as part of the compute node Dockerfile. --------- Co-authored-by: Alexander Bayandin --- compute/compute-node.Dockerfile | 81 +++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index dfed01daae..1b2167ea11 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -873,6 +873,85 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux USER root +######################################################################################### +# +# Layer "rust extensions pgrx12" +# +# pgrx started to support Postgres 17 since version 12, +# but some older extension aren't compatible with it. 
+# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM build-deps AS rust-extensions-build-pgrx12 +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install --no-install-recommends -y curl libclang-dev && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +USER nonroot +WORKDIR /home/nonroot + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init && \ + cargo install --locked --version 0.12.6 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + +######################################################################################### +# +# Layers "pg-onnx-build" and "pgrag-pg-build" +# Compile "pgrag" extensions +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-onnx-build + +# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). +# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ + python3 -m venv venv && \ + . venv/bin/activate && \ + python3 -m pip install cmake==3.30.5 && \ + wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + + +FROM pg-onnx-build AS pgrag-pg-build + +RUN apt-get install -y protobuf-compiler && \ + wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ + \ + cd exts/rag && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ + \ + cd ../rag_bge_small_en_v15 && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ + \ + cd ../rag_jina_reranker_v1_tiny_en && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control + + ######################################################################################### # # Layer "pg-jsonschema-pg-build" @@ -1085,6 +1164,7 @@ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1274,6 +1354,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src #COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src From 062456561783d09cbe8eeedcf0a244d13866bd50 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Oct 2024 00:47:15 +0200 Subject: [PATCH 15/27] Create the notion of unstable extensions As a DBaaS provider, Neon needs to provide a stable platform for customers to build applications upon. At the same time however, we also need to enable customers to use the latest and greatest technology, so they can prototype their work, and we can solicit feedback. If all extensions are treated the same in terms of stability, it is hard to meet that goal. There are now two new GUCs created by the Neon extension: neon.allow_unstable_extensions: This is a session GUC which allows a session to install and load unstable extensions. neon.unstable_extensions: This is a comma-separated list of extension names. We can check if a CREATE EXTENSION statement is attempting to install an unstable extension, and if so, deny the request if neon.allow_unstable_extensions is not set to true. 
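For illustration, a minimal SQL sketch of the intended behaviour (the extension names are only examples, mirroring the regression test below; the actual list is whatever the operator configures):

```sql
-- Assuming the compute was started with:
--   neon.unstable_extensions = 'pg_prewarm,amcheck'
CREATE EXTENSION pg_prewarm;                  -- rejected: insufficient_privilege,
                                              -- hint: "Set neon.allow_unstable_extensions to true"
SET neon.allow_unstable_extensions TO true;   -- superuser-settable (PGC_SUSET)
CREATE EXTENSION pg_prewarm;                  -- now allowed
CREATE EXTENSION pageinspect;                 -- extensions not in the list are unaffected
```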
Signed-off-by: Tristan Partin Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 1 + pgxn/neon/neon.c | 2 + pgxn/neon/neon_pgversioncompat.c | 1 + pgxn/neon/unstable_extensions.c | 129 ++++++++++++++++++ pgxn/neon/unstable_extensions.h | 6 + .../regress/test_unstable_extensions.py | 50 +++++++ 6 files changed, 189 insertions(+) create mode 100644 pgxn/neon/unstable_extensions.c create mode 100644 pgxn/neon/unstable_extensions.h create mode 100644 test_runner/regress/test_unstable_extensions.py diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 42f2a8efda..c87ae59fd6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -16,6 +16,7 @@ OBJS = \ neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ + unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ control_plane_connector.o \ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index f8ec725c18..dc87d79e87 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -30,6 +30,7 @@ #include "neon.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" +#include "unstable_extensions.h" #include "walsender_hooks.h" #if PG_MAJORVERSION_NUM >= 16 #include "storage/ipc.h" @@ -424,6 +425,7 @@ _PG_init(void) LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); InitControlPlaneConnector(); diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index a0dbddde4b..7c404fb5a9 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) MemoryContextSwitchTo(old_context); } #endif + diff --git a/pgxn/neon/unstable_extensions.c b/pgxn/neon/unstable_extensions.c new file mode 100644 index 0000000000..a3445cb268 --- /dev/null +++ b/pgxn/neon/unstable_extensions.c @@ -0,0 +1,129 @@ +#include +#include + +#include "postgres.h" + +#include "nodes/plannodes.h" +#include "nodes/parsenodes.h" +#include "tcop/utility.h" +#include "utils/errcodes.h" +#include "utils/guc.h" + +#include "neon_pgversioncompat.h" +#include "unstable_extensions.h" + +static bool allow_unstable_extensions = false; +static char *unstable_extensions = NULL; + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +static bool +list_contains(char const* comma_separated_list, char const* val) +{ + char const* occ = comma_separated_list; + size_t val_len = strlen(val); + + if (val_len == 0) + return false; + + while ((occ = strstr(occ, val)) != NULL) + { + if ((occ == comma_separated_list || occ[-1] == ',') + && (occ[val_len] == '\0' || occ[val_len] == ',')) + { + return true; + } + occ += val_len; + } + + return false; +} + + +static void +CheckUnstableExtension( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + + if (allow_unstable_extensions || unstable_extensions == NULL) + goto process; + + switch (nodeTag(parseTree)) + { + case T_CreateExtensionStmt: + { + CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree); + if (list_contains(unstable_extensions, stmt->extname)) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("installing %s is currently prohibited", stmt->extname), + errhint("Set 
neon.allow_unstable_extensions to true"))); + } + break; + } + default: + goto process; + } + +process: + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } +} + +void +InitUnstableExtensionsSupport(void) +{ + DefineCustomBoolVariable( + "neon.allow_unstable_extensions", + "Allow unstable extensions to be installed and used", + NULL, + &allow_unstable_extensions, + false, + PGC_SUSET, + 0, + NULL, NULL, NULL); + + DefineCustomStringVariable( + "neon.unstable_extensions", + "Allow unstable extensions to be installed and used", + NULL, + &unstable_extensions, + NULL, + PGC_SUSET, + 0, + NULL, NULL, NULL); + + PreviousProcessUtilityHook = ProcessUtility_hook; + ProcessUtility_hook = CheckUnstableExtension; +} diff --git a/pgxn/neon/unstable_extensions.h b/pgxn/neon/unstable_extensions.h new file mode 100644 index 0000000000..3c695e9fb2 --- /dev/null +++ b/pgxn/neon/unstable_extensions.h @@ -0,0 +1,6 @@ +#ifndef __NEON_UNSTABLE_EXTENSIONS_H__ +#define __NEON_UNSTABLE_EXTENSIONS_H__ + +void InitUnstableExtensionsSupport(void); + +#endif diff --git a/test_runner/regress/test_unstable_extensions.py b/test_runner/regress/test_unstable_extensions.py new file mode 100644 index 0000000000..06a62ccfd8 --- /dev/null +++ b/test_runner/regress/test_unstable_extensions.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +import pytest +from psycopg2.errors import InsufficientPrivilege + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_unstable_extensions_installation(neon_simple_env: NeonEnv): + """ + Test that the unstable extension support within the neon extension can + block extension installation. + """ + env = neon_simple_env + + neon_unstable_extensions = "pg_prewarm,amcheck" + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "neon.allow_unstable_extensions=false", + f"neon.unstable_extensions='{neon_unstable_extensions}'", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT current_setting('neon.unstable_extensions')") + result = cursor.fetchone() + assert result is not None + setting = cast("str", result[0]) + assert setting == neon_unstable_extensions + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION pg_prewarm") + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION amcheck") + + # Make sure that we can install a "stable" extension + cursor.execute("CREATE EXTENSION pageinspect") + + cursor.execute("BEGIN") + cursor.execute("SET neon.allow_unstable_extensions TO true") + cursor.execute("CREATE EXTENSION pg_prewarm") + cursor.execute("COMMIT") From 4df3987054a7cef88322713b9c4a0e3b1a706131 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Oct 2024 18:21:45 -0500 Subject: [PATCH 16/27] Get role name when not a C string We will only have a C string if the specified role is a string. Otherwise, we need to resolve references to public, current_role, current_user, and session_user. 
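For illustration, the kinds of statements this change affects (a sketch; role names and passwords are placeholders):

```sql
ALTER ROLE some_role    WITH PASSWORD 'x';  -- ROLESPEC_CSTRING: rolename is already a C string
ALTER ROLE CURRENT_USER WITH PASSWORD 'x';  -- resolved via get_rolespec_name()
ALTER ROLE CURRENT_ROLE WITH PASSWORD 'x';
ALTER ROLE SESSION_USER WITH PASSWORD 'x';
ALTER ROLE PUBLIC       WITH PASSWORD 'x';  -- rejected by Postgres itself (undefined_object)
```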
Fixes: https://github.com/neondatabase/cloud/issues/19323 Signed-off-by: Tristan Partin --- pgxn/neon/control_plane_connector.c | 12 +++++++- test_runner/regress/test_ddl_forwarding.py | 32 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 4713103909..b47b22cd20 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -18,6 +18,7 @@ * *------------------------------------------------------------------------- */ + #include "postgres.h" #include @@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg) static bool RoleIsNeonSuperuser(const char *role_name) { + Assert(role_name); + return strcmp(role_name, "neon_superuser") == 0; } @@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt) static void HandleAlterRole(AlterRoleStmt *stmt) { - const char *role_name = stmt->role->rolename; + char *role_name; DefElem *dpass; ListCell *option; bool found = false; @@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt) InitRoleTableIfNeeded(); + role_name = get_rolespec_name(stmt->role); if (RoleIsNeonSuperuser(role_name) && !superuser()) elog(ERROR, "can't ALTER neon_superuser"); @@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt) if (strcmp(defel->defname, "password") == 0) dpass = defel; } + /* We only care about updates to the password */ if (!dpass) + { + pfree(role_name); return; + } entry = hash_search(CurrentDdlTable->role_table, role_name, @@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt) else entry->password = NULL; entry->type = Op_Set; + + pfree(role_name); } static void diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 96657b3ce4..e517e83e6f 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -7,6 +7,7 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, VanillaPostgres +from psycopg2.errors import UndefinedObject from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -335,3 +336,34 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): if not result: raise AssertionError("Could not count databases") assert result[0] == 0, "Database 'failure' still exists after restart" + + +def test_ddl_forwarding_role_specs(neon_simple_env: NeonEnv): + """ + Postgres has a concept of role specs: + + ROLESPEC_CSTRING: ALTER ROLE xyz + ROLESPEC_CURRENT_USER: ALTER ROLE current_user + ROLESPEC_CURRENT_ROLE: ALTER ROLE current_role + ROLESPEC_SESSION_USER: ALTER ROLE session_user + ROLESPEC_PUBLIC: ALTER ROLE public + + The extension is required to serialize these special role spec into + usernames for the purpose of DDL forwarding. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + with endpoint.cursor() as cur: + # ROLESPEC_CSTRING + cur.execute("ALTER ROLE cloud_admin WITH PASSWORD 'york'") + # ROLESPEC_CURRENT_USER + cur.execute("ALTER ROLE current_user WITH PASSWORD 'pork'") + # ROLESPEC_CURRENT_ROLE + cur.execute("ALTER ROLE current_role WITH PASSWORD 'cork'") + # ROLESPEC_SESSION_USER + cur.execute("ALTER ROLE session_user WITH PASSWORD 'bork'") + # ROLESPEC_PUBLIC + with pytest.raises(UndefinedObject): + cur.execute("ALTER ROLE public WITH PASSWORD 'dork'") From 62f5d484d994be08eaedd7b6627f194b91e7b93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Oct 2024 01:36:05 +0100 Subject: [PATCH 17/27] Assert the tenant to be active in `unoffload_timeline` (#9539) Currently, all callers of `unoffload_timeline` ensure that the tenant the unoffload operation is called on is active. We rely on it being active as we activate the timeline below and don't want to race with the activation code of the tenant (in the worst case, activating a timeline twice). Therefore, add this assertion. Part of #8088 --- pageserver/src/tenant.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64e871cada..7011ae9e63 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1830,6 +1830,18 @@ impl Tenant { ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); + + // We activate the timeline below manually, so this must be called on an active timeline. + // We expect callers of this function to ensure this. + match self.current_state() { + TenantState::Activating { .. } + | TenantState::Attaching + | TenantState::Broken { .. } => { + panic!("Timeline expected to be active") + } + TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled), + TenantState::Active => {} + } let cancel = self.cancel.clone(); // Protect against concurrent attempts to use this TimelineId From 07b974480c642bc79a63cfd0d456a607533fe966 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 29 Oct 2024 10:00:34 +0000 Subject: [PATCH 18/27] pageserver: move things around to prepare for decoding logic (#9504) ## Problem We wish to have high level WAL decoding logic in `wal_decoder::decoder` module. ## Summary of Changes For this we need the `Value` and `NeonWalRecord` types accessible there, so: 1. Move `Value` and `NeonWalRecord` to `pageserver::value` and `pageserver::record` respectively. 2. Get rid of `pageserver::repository` (follow up from (1)) 3. Move PG specific WAL record types to `postgres_ffi::walrecord`. In theory they could live in `wal_decoder`, but it would create a circular dependency between `wal_decoder` and `postgres_ffi`. Long term it makes sense for those types to be PG version specific, so that will work out nicely. 4. 
Move higher level WAL record types (to be ingested by pageserver) into `wal_decoder::models` Related: https://github.com/neondatabase/neon/issues/9335 Epic: https://github.com/neondatabase/neon/issues/9329 --- Cargo.lock | 16 + Cargo.toml | 2 + libs/pageserver_api/src/lib.rs | 2 + libs/pageserver_api/src/record.rs | 113 +++ .../pageserver_api/src/value.rs | 80 +- libs/postgres_ffi/Cargo.toml | 1 + libs/postgres_ffi/src/lib.rs | 1 + .../postgres_ffi}/src/walrecord.rs | 942 ++++++++---------- libs/wal_decoder/Cargo.toml | 18 + libs/wal_decoder/src/decoder.rs | 1 + libs/wal_decoder/src/lib.rs | 2 + libs/wal_decoder/src/models.rs | 167 ++++ pageserver/Cargo.toml | 3 +- pageserver/benches/bench_ingest.rs | 3 +- pageserver/benches/bench_layer_map.rs | 2 +- pageserver/benches/bench_walredo.rs | 3 +- pageserver/ctl/src/draw_timeline_dir.rs | 2 +- pageserver/ctl/src/layer_map_analyzer.rs | 2 +- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/deletion_queue.rs | 3 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/import_datadir.rs | 3 +- pageserver/src/lib.rs | 2 - pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant.rs | 32 +- pageserver/src/tenant/gc_result.rs | 57 ++ pageserver/src/tenant/layer_map.rs | 2 +- pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/storage_layer.rs | 4 +- .../storage_layer/batch_split_writer.rs | 3 +- .../src/tenant/storage_layer/delta_layer.rs | 14 +- .../tenant/storage_layer/filter_iterator.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 5 +- .../tenant/storage_layer/inmemory_layer.rs | 3 +- .../src/tenant/storage_layer/layer/tests.rs | 4 +- .../src/tenant/storage_layer/layer_desc.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 2 +- .../tenant/storage_layer/merge_iterator.rs | 19 +- pageserver/src/tenant/timeline.rs | 21 +- pageserver/src/tenant/timeline/compaction.rs | 9 +- .../walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 151 +-- pageserver/src/walredo.rs | 9 +- pageserver/src/walredo/apply_neon.rs | 4 +- pageserver/src/walredo/process.rs | 2 +- 45 files changed, 925 insertions(+), 806 deletions(-) create mode 100644 libs/pageserver_api/src/record.rs rename pageserver/src/repository.rs => libs/pageserver_api/src/value.rs (73%) rename {pageserver => libs/postgres_ffi}/src/walrecord.rs (88%) create mode 100644 libs/wal_decoder/Cargo.toml create mode 100644 libs/wal_decoder/src/decoder.rs create mode 100644 libs/wal_decoder/src/lib.rs create mode 100644 libs/wal_decoder/src/models.rs create mode 100644 pageserver/src/tenant/gc_result.rs diff --git a/Cargo.lock b/Cargo.lock index 610b607482..c5af247e8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3749,6 +3749,7 @@ dependencies = [ "tracing", "url", "utils", + "wal_decoder", "walkdir", "workspace_hack", ] @@ -4186,6 +4187,7 @@ dependencies = [ "regex", "serde", "thiserror", + "tracing", "utils", ] @@ -6954,6 +6956,20 @@ dependencies = [ "utils", ] +[[package]] +name = "wal_decoder" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "pageserver_api", + "postgres_ffi", + "serde", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "walkdir" version = "2.3.3" diff --git a/Cargo.toml b/Cargo.toml index 4c6a24ecde..7f9a766ff9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ members = [ "libs/postgres_ffi/wal_craft", "libs/vm_monitor", "libs/walproposer", + "libs/wal_decoder", ] [workspace.package] @@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = 
"./libs/utils/" } vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" } walproposer = { version = "0.1", path = "./libs/walproposer/" } +wal_decoder = { version = "0.1", path = "./libs/wal_decoder" } ## Common library dependency workspace_hack = { version = "0.1", path = "./workspace_hack/" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 532185a366..ff705e79cd 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,9 +5,11 @@ pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; +pub mod record; pub mod reltag; pub mod shard; /// Public API types pub mod upcall_api; +pub mod value; pub mod config; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs new file mode 100644 index 0000000000..b80ed2f203 --- /dev/null +++ b/libs/pageserver_api/src/record.rs @@ -0,0 +1,113 @@ +//! This module defines the WAL record format used within the pageserver. + +use bytes::Bytes; +use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use utils::bin_ser::DeserializeError; + +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum NeonWalRecord { + /// Native PostgreSQL WAL record + Postgres { will_init: bool, rec: Bytes }, + + /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) + ClearVisibilityMapFlags { + new_heap_blkno: Option, + old_heap_blkno: Option, + flags: u8, + }, + /// Mark transaction IDs as committed on a CLOG page + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, + /// Mark transaction IDs as aborted on a CLOG page + ClogSetAborted { xids: Vec }, + /// Extend multixact offsets SLRU + MultixactOffsetCreate { + mid: MultiXactId, + moff: MultiXactOffset, + }, + /// Extend multixact members SLRU. + MultixactMembersCreate { + moff: MultiXactOffset, + members: Vec, + }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(feature = "testing")] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. `clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, +} + +impl NeonWalRecord { + /// Does replaying this WAL record initialize the page from scratch, or does + /// it need to be applied over the previous image of the page? + pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init + match self { + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, + #[cfg(feature = "testing")] + NeonWalRecord::Test { will_init, .. 
} => *will_init, + // None of the special neon record types currently initialize the page + _ => false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_append(s: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } +} + +/// Build a human-readable string to describe a WAL record +/// +/// For debugging purposes +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { + match rec { + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), + } +} diff --git a/pageserver/src/repository.rs b/libs/pageserver_api/src/value.rs similarity index 73% rename from pageserver/src/repository.rs rename to libs/pageserver_api/src/value.rs index e4ebafd927..1f8ed30a9a 100644 --- a/pageserver/src/repository.rs +++ b/libs/pageserver_api/src/value.rs @@ -1,13 +1,16 @@ -use crate::walrecord::NeonWalRecord; -use anyhow::Result; +//! This module defines the value type used by the storage engine. +//! +//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]), +//! or a "delta" of how to get from previous version of the value to the new one +//! ([`Value::WalRecord`]]) +//! +//! Note that the [`Value`] type is used for the permananent storage format, so any +//! changes to it must be backwards compatible. + +use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::AddAssign; -use std::time::Duration; -pub use pageserver_api::key::{Key, KEY_SIZE}; - -/// A 'value' stored for a one Key. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -20,10 +23,12 @@ pub enum Value { } impl Value { + #[inline(always)] pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) } + #[inline(always)] pub fn will_init(&self) -> bool { match self { Value::Image(_) => true, @@ -33,17 +38,18 @@ impl Value { } #[derive(Debug, PartialEq)] -pub(crate) enum InvalidInput { +pub enum InvalidInput { TooShortValue, TooShortPostgresRecord, } /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. -pub(crate) struct ValueBytes; +pub struct ValueBytes; impl ValueBytes { - pub(crate) fn will_init(raw: &[u8]) -> Result { + #[inline(always)] + pub fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { return Err(InvalidInput::TooShortValue); } @@ -79,6 +85,7 @@ impl ValueBytes { mod test { use super::*; + use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! roundtrip { @@ -229,56 +236,3 @@ mod test { assert!(!ValueBytes::will_init(&expected).unwrap()); } } - -/// -/// Result of performing GC -/// -#[derive(Default, Serialize, Debug)] -pub struct GcResult { - pub layers_total: u64, - pub layers_needed_by_cutoff: u64, - pub layers_needed_by_pitr: u64, - pub layers_needed_by_branches: u64, - pub layers_needed_by_leases: u64, - pub layers_not_updated: u64, - pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
- - #[serde(serialize_with = "serialize_duration_as_millis")] - pub elapsed: Duration, - - /// The layers which were garbage collected. - /// - /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be - /// dropped in tests. - #[cfg(feature = "testing")] - #[serde(skip)] - pub(crate) doomed_layers: Vec, -} - -// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds -fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result -where - S: serde::Serializer, -{ - d.as_millis().serialize(serializer) -} - -impl AddAssign for GcResult { - fn add_assign(&mut self, other: Self) { - self.layers_total += other.layers_total; - self.layers_needed_by_pitr += other.layers_needed_by_pitr; - self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; - self.layers_needed_by_branches += other.layers_needed_by_branches; - self.layers_needed_by_leases += other.layers_needed_by_leases; - self.layers_not_updated += other.layers_not_updated; - self.layers_removed += other.layers_removed; - - self.elapsed += other.elapsed; - - #[cfg(feature = "testing")] - { - let mut other = other; - self.doomed_layers.append(&mut other.doomed_layers); - } - } -} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ef17833a48..e1f5443cbe 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -15,6 +15,7 @@ memoffset.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true +tracing.workspace = true [dev-dependencies] env_logger.workspace = true diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0d46ed6aac..6b219488ac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -217,6 +217,7 @@ macro_rules! enum_pgversion { pub mod pg_constants; pub mod relfile_utils; +pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::RepOriginId; diff --git a/pageserver/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs similarity index 88% rename from pageserver/src/walrecord.rs rename to libs/postgres_ffi/src/walrecord.rs index dd199e2c55..dedbaef64d 100644 --- a/pageserver/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -1,107 +1,144 @@ +//! This module houses types used in decoding of PG WAL +//! records. //! -//! Functions for parsing WAL records. -//! +//! TODO: Generate separate types for each supported PG version -use anyhow::Result; +use crate::pg_constants; +use crate::XLogRecord; +use crate::{ + BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, + TransactionId, +}; +use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, TimestampTz}; -use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; -use tracing::*; -use utils::{bin_ser::DeserializeError, lsn::Lsn}; +use utils::bin_ser::DeserializeError; +use utils::lsn::Lsn; -/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom neon-specific "record". 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum NeonWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { - xids: Vec, - timestamp: TimestampTz, - }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, - /// Update the map of AUX files, either writing or dropping an entry - AuxFile { - file_path: String, - content: Option, - }, - - /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. - #[cfg(test)] - Test { - /// Append a string to the image. - append: String, - /// Clear the image before appending. - clear: bool, - /// Treat this record as an init record. `clear` should be set to true if this field is set - /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and - /// its references in `timeline.rs`. - will_init: bool, - }, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactCreate { + pub mid: MultiXactId, + /* new MultiXact's ID */ + pub moff: MultiXactOffset, + /* its starting offset in members file */ + pub nmembers: u32, + /* number of member XIDs */ + pub members: Vec, } -impl NeonWalRecord { - /// Does replaying this WAL record initialize the page from scratch, or does - /// it need to be applied over the previous image of the page? - pub fn will_init(&self) -> bool { - // If you change this function, you'll also need to change ValueBytes::will_init - match self { - NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - #[cfg(test)] - NeonWalRecord::Test { will_init, .. 
} => *will_init, - // None of the special neon record types currently initialize the page - _ => false, +impl XlMultiXactCreate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { + let mid = buf.get_u32_le(); + let moff = buf.get_u32_le(); + let nmembers = buf.get_u32_le(); + let mut members = Vec::new(); + for _ in 0..nmembers { + members.push(MultiXactMember::decode(buf)); + } + XlMultiXactCreate { + mid, + moff, + nmembers, + members, } } +} - #[cfg(test)] - pub(crate) fn wal_append(s: impl AsRef) -> Self { - Self::Test { - append: s.as_ref().to_string(), - clear: false, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactTruncate { + pub oldest_multi_db: Oid, + /* to-be-truncated range of multixact offsets */ + pub start_trunc_off: MultiXactId, + /* just for completeness' sake */ + pub end_trunc_off: MultiXactId, + + /* to-be-truncated range of multixact members */ + pub start_trunc_memb: MultiXactOffset, + pub end_trunc_memb: MultiXactOffset, +} + +impl XlMultiXactTruncate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { + XlMultiXactTruncate { + oldest_multi_db: buf.get_u32_le(), + start_trunc_off: buf.get_u32_le(), + end_trunc_off: buf.get_u32_le(), + start_trunc_memb: buf.get_u32_le(), + end_trunc_memb: buf.get_u32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_clear() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlRelmapUpdate { + pub dbid: Oid, /* database ID, or 0 for shared map */ + pub tsid: Oid, /* database's tablespace, or pg_global */ + pub nbytes: i32, /* size of relmap data */ +} + +impl XlRelmapUpdate { + pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { + XlRelmapUpdate { + dbid: buf.get_u32_le(), + tsid: buf.get_u32_le(), + nbytes: buf.get_i32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_init() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: true, +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct RelFileNode { + pub spcnode: Oid, /* tablespace */ + pub dbnode: Oid, /* database */ + pub relnode: Oid, /* relation */ +} + +#[repr(C)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct MultiXactMember { + pub xid: TransactionId, + pub status: MultiXactStatus, +} + +impl MultiXactMember { + pub fn decode(buf: &mut Bytes) -> MultiXactMember { + MultiXactMember { + xid: buf.get_u32_le(), + status: buf.get_u32_le(), } } } @@ -164,17 +201,17 @@ impl DecodedWALRecord { /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations /// by reading other existing relations' data blocks. This is more complex to apply than new-style database /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. 
- pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { 14 => { // Postgres 14 database creations are always the legacy kind - info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + info == crate::v14::bindings::XLOG_DBASE_CREATE } - 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -185,35 +222,294 @@ impl DecodedWALRecord { } } -#[repr(C)] -#[derive(Debug, Clone, Copy)] -pub struct RelFileNode { - pub spcnode: Oid, /* tablespace */ - pub dbnode: Oid, /* database */ - pub relnode: Oid, /* relation */ -} +/// Main routine to decode a WAL record and figure out which blocks are modified +// +// See xlogrecord.h for details +// The overall layout of an XLOG record is: +// Fixed-size header (XLogRecord struct) +// XLogRecordBlockHeader struct +// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows +// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an +// XLogRecordBlockCompressHeader struct follows. +// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows +// BlockNumber follows +// XLogRecordBlockHeader struct +// ... +// XLogRecordDataHeader[Short|Long] struct +// block data +// block data +// ... +// main data +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, + pg_version: u32, +) -> anyhow::Result<()> { + let mut rnode_spcnode: u32 = 0; + let mut rnode_dbnode: u32 = 0; + let mut rnode_relnode: u32 = 0; + let mut got_rnode = false; + let mut origin_id: u16 = 0; -#[repr(C)] -#[derive(Debug)] -pub struct XlRelmapUpdate { - pub dbid: Oid, /* database ID, or 0 for shared map */ - pub tsid: Oid, /* database's tablespace, or pg_global */ - pub nbytes: i32, /* size of relmap data */ -} + let mut buf = record.clone(); -impl XlRelmapUpdate { - pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { - XlRelmapUpdate { - dbid: buf.get_u32_le(), - tsid: buf.get_u32_le(), - nbytes: buf.get_i32_le(), + // 1. Parse XLogRecord struct + + // FIXME: assume little-endian here + let xlogrec = XLogRecord::from_bytes(&mut buf)?; + + tracing::trace!( + "decode_wal_record xl_rmid = {} xl_info = {}", + xlogrec.xl_rmid, + xlogrec.xl_info + ); + + let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; + + if buf.remaining() != remaining { + //TODO error + } + + let mut max_block_id = 0; + let mut blocks_total_len: u32 = 0; + let mut main_data_len = 0; + let mut datatotal: u32 = 0; + decoded.blocks.clear(); + + // 2. Decode the headers. 
+ // XLogRecordBlockHeaders if any, + // XLogRecordDataHeader[Short|Long] + while buf.remaining() > datatotal as usize { + let block_id = buf.get_u8(); + + match block_id { + pg_constants::XLR_BLOCK_ID_DATA_SHORT => { + /* XLogRecordDataHeaderShort */ + main_data_len = buf.get_u8() as u32; + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_DATA_LONG => { + /* XLogRecordDataHeaderLong */ + main_data_len = buf.get_u32_le(); + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_ORIGIN => { + // RepOriginId is uint16 + origin_id = buf.get_u16_le(); + } + + pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { + // TransactionId is uint32 + buf.advance(4); + } + + 0..=pg_constants::XLR_MAX_BLOCK_ID => { + /* XLogRecordBlockHeader */ + let mut blk = DecodedBkpBlock::new(); + + if block_id <= max_block_id { + // TODO + //report_invalid_record(state, + // "out-of-order block_id %u at %X/%X", + // block_id, + // (uint32) (state->ReadRecPtr >> 32), + // (uint32) state->ReadRecPtr); + // goto err; + } + max_block_id = block_id; + + let fork_flags: u8 = buf.get_u8(); + blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; + blk.flags = fork_flags; + blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; + blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; + blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; + blk.data_len = buf.get_u16_le(); + + /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ + + datatotal += blk.data_len as u32; + blocks_total_len += blk.data_len as u32; + + if blk.has_image { + blk.bimg_len = buf.get_u16_le(); + blk.hole_offset = buf.get_u16_le(); + blk.bimg_info = buf.get_u8(); + + blk.apply_image = dispatch_pgversion!( + pg_version, + (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 + ); + + let blk_img_is_compressed = + crate::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + tracing::debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { + blk.hole_length = buf.get_u16_le(); + } else { + blk.hole_length = 0; + } + } else { + blk.hole_length = BLCKSZ - blk.bimg_len; + } + datatotal += blk.bimg_len as u32; + blocks_total_len += blk.bimg_len as u32; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && (blk.hole_offset != 0 || blk.hole_length != 0) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. 
+ */ + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && !blk_img_is_compressed + && blk.bimg_len != BLCKSZ + { + // TODO + /* + report_invalid_record(state, + "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + } + if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { + rnode_spcnode = buf.get_u32_le(); + rnode_dbnode = buf.get_u32_le(); + rnode_relnode = buf.get_u32_le(); + got_rnode = true; + } else if !got_rnode { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; */ + } + + blk.rnode_spcnode = rnode_spcnode; + blk.rnode_dbnode = rnode_dbnode; + blk.rnode_relnode = rnode_relnode; + + blk.blkno = buf.get_u32_le(); + tracing::trace!( + "this record affects {}/{}/{} blk {}", + rnode_spcnode, + rnode_dbnode, + rnode_relnode, + blk.blkno + ); + + decoded.blocks.push(blk); + } + + _ => { + // TODO: invalid block_id + } } } + + // 3. Decode blocks. + let mut ptr = record.len() - buf.remaining(); + for blk in decoded.blocks.iter_mut() { + if blk.has_image { + blk.bimg_offset = ptr as u32; + ptr += blk.bimg_len as usize; + } + if blk.has_data { + ptr += blk.data_len as usize; + } + } + // We don't need them, so just skip blocks_total_len bytes + buf.advance(blocks_total_len as usize); + assert_eq!(ptr, record.len() - buf.remaining()); + + let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; + + // 4. Decode main_data + if main_data_len > 0 { + assert_eq!(buf.remaining(), main_data_len as usize); + } + + decoded.xl_xid = xlogrec.xl_xid; + decoded.xl_info = xlogrec.xl_info; + decoded.xl_rmid = xlogrec.xl_rmid; + decoded.record = record; + decoded.origin_id = origin_id; + decoded.main_data_offset = main_data_offset; + + Ok(()) } pub mod v14 { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -383,8 +679,8 @@ pub mod v15 { pub mod v16 { pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -450,8 +746,8 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. 
*/ pub mod rm_neon { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -563,8 +859,8 @@ pub mod v16 { pub mod v17 { pub use super::v14::XlHeapLockUpdated; + pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use postgres_ffi::{TimeLineID, TimestampTz}; pub use super::v16::rm_neon; pub use super::v16::{ @@ -742,7 +1038,7 @@ impl XlXactParsedRecord { let spcnode = buf.get_u32_le(); let dbnode = buf.get_u32_le(); let relnode = buf.get_u32_le(); - trace!( + tracing::trace!( "XLOG_XACT_COMMIT relfilenode {}/{}/{}", spcnode, dbnode, @@ -756,9 +1052,9 @@ impl XlXactParsedRecord { } } - if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + if xinfo & crate::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { let nitems = buf.get_i32_le(); - debug!( + tracing::debug!( "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", nitems ); @@ -778,7 +1074,7 @@ impl XlXactParsedRecord { if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { xid = buf.get_u32_le(); - debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); + tracing::debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { @@ -822,78 +1118,6 @@ impl XlClogTruncate { } } -#[repr(C)] -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct MultiXactMember { - pub xid: TransactionId, - pub status: MultiXactStatus, -} - -impl MultiXactMember { - pub fn decode(buf: &mut Bytes) -> MultiXactMember { - MultiXactMember { - xid: buf.get_u32_le(), - status: buf.get_u32_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactCreate { - pub mid: MultiXactId, - /* new MultiXact's ID */ - pub moff: MultiXactOffset, - /* its starting offset in members file */ - pub nmembers: u32, - /* number of member XIDs */ - pub members: Vec, -} - -impl XlMultiXactCreate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { - let mid = buf.get_u32_le(); - let moff = buf.get_u32_le(); - let nmembers = buf.get_u32_le(); - let mut members = Vec::new(); - for _ in 0..nmembers { - members.push(MultiXactMember::decode(buf)); - } - XlMultiXactCreate { - mid, - moff, - nmembers, - members, - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactTruncate { - pub oldest_multi_db: Oid, - /* to-be-truncated range of multixact offsets */ - pub start_trunc_off: MultiXactId, - /* just for completeness' sake */ - pub end_trunc_off: MultiXactId, - - /* to-be-truncated range of multixact members */ - pub start_trunc_memb: MultiXactOffset, - pub end_trunc_memb: MultiXactOffset, -} - -impl XlMultiXactTruncate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { - XlMultiXactTruncate { - oldest_multi_db: buf.get_u32_le(), - start_trunc_off: buf.get_u32_le(), - end_trunc_off: buf.get_u32_le(), - start_trunc_memb: buf.get_u32_le(), - end_trunc_memb: buf.get_u32_le(), - } - } -} - #[repr(C)] #[derive(Debug)] pub struct XlLogicalMessage { @@ -950,337 +1174,7 @@ impl XlRunningXacts { } } -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginDrop { - pub node_id: RepOriginId, -} - -impl XlReploriginDrop { - pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { - XlReploriginDrop { - node_id: buf.get_u16_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginSet { - pub remote_lsn: Lsn, - pub node_id: RepOriginId, -} - -impl XlReploriginSet { - pub fn decode(buf: &mut Bytes) -> XlReploriginSet 
{ - XlReploriginSet { - remote_lsn: Lsn(buf.get_u64_le()), - node_id: buf.get_u16_le(), - } - } -} - -/// Main routine to decode a WAL record and figure out which blocks are modified -// -// See xlogrecord.h for details -// The overall layout of an XLOG record is: -// Fixed-size header (XLogRecord struct) -// XLogRecordBlockHeader struct -// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows -// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an -// XLogRecordBlockCompressHeader struct follows. -// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows -// BlockNumber follows -// XLogRecordBlockHeader struct -// ... -// XLogRecordDataHeader[Short|Long] struct -// block data -// block data -// ... -// main data -// -// -// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. -// It would be more natural for this function to return a DecodedWALRecord as return value, -// but reusing the caller-supplied struct avoids an allocation. -// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. -// -pub fn decode_wal_record( - record: Bytes, - decoded: &mut DecodedWALRecord, - pg_version: u32, -) -> Result<()> { - let mut rnode_spcnode: u32 = 0; - let mut rnode_dbnode: u32 = 0; - let mut rnode_relnode: u32 = 0; - let mut got_rnode = false; - let mut origin_id: u16 = 0; - - let mut buf = record.clone(); - - // 1. Parse XLogRecord struct - - // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf)?; - - trace!( - "decode_wal_record xl_rmid = {} xl_info = {}", - xlogrec.xl_rmid, - xlogrec.xl_info - ); - - let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; - - if buf.remaining() != remaining { - //TODO error - } - - let mut max_block_id = 0; - let mut blocks_total_len: u32 = 0; - let mut main_data_len = 0; - let mut datatotal: u32 = 0; - decoded.blocks.clear(); - - // 2. Decode the headers. 
- // XLogRecordBlockHeaders if any, - // XLogRecordDataHeader[Short|Long] - while buf.remaining() > datatotal as usize { - let block_id = buf.get_u8(); - - match block_id { - pg_constants::XLR_BLOCK_ID_DATA_SHORT => { - /* XLogRecordDataHeaderShort */ - main_data_len = buf.get_u8() as u32; - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_DATA_LONG => { - /* XLogRecordDataHeaderLong */ - main_data_len = buf.get_u32_le(); - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_ORIGIN => { - // RepOriginId is uint16 - origin_id = buf.get_u16_le(); - } - - pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { - // TransactionId is uint32 - buf.advance(4); - } - - 0..=pg_constants::XLR_MAX_BLOCK_ID => { - /* XLogRecordBlockHeader */ - let mut blk = DecodedBkpBlock::new(); - - if block_id <= max_block_id { - // TODO - //report_invalid_record(state, - // "out-of-order block_id %u at %X/%X", - // block_id, - // (uint32) (state->ReadRecPtr >> 32), - // (uint32) state->ReadRecPtr); - // goto err; - } - max_block_id = block_id; - - let fork_flags: u8 = buf.get_u8(); - blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; - blk.flags = fork_flags; - blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; - blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; - blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; - blk.data_len = buf.get_u16_le(); - - /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ - - datatotal += blk.data_len as u32; - blocks_total_len += blk.data_len as u32; - - if blk.has_image { - blk.bimg_len = buf.get_u16_le(); - blk.hole_offset = buf.get_u16_le(); - blk.bimg_info = buf.get_u8(); - - blk.apply_image = dispatch_pgversion!( - pg_version, - (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 - ); - - let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); - - if blk_img_is_compressed { - debug!("compressed block image , pg_version = {}", pg_version); - } - - if blk_img_is_compressed { - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { - blk.hole_length = buf.get_u16_le(); - } else { - blk.hole_length = 0; - } - } else { - blk.hole_length = BLCKSZ - blk.bimg_len; - } - datatotal += blk.bimg_len as u32; - blocks_total_len += blk.bimg_len as u32; - - /* - * cross-check that hole_offset > 0, hole_length > 0 and - * bimg_len < BLCKSZ if the HAS_HOLE flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that hole_offset == 0 and hole_length == 0 if - * the HAS_HOLE flag is not set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && (blk.hole_offset != 0 || blk.hole_length != 0) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED - * flag is set. 
- */ - if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor - * IS_COMPRESSED flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && !blk_img_is_compressed - && blk.bimg_len != BLCKSZ - { - // TODO - /* - report_invalid_record(state, - "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", - (unsigned int) blk->data_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - } - if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { - rnode_spcnode = buf.get_u32_le(); - rnode_dbnode = buf.get_u32_le(); - rnode_relnode = buf.get_u32_le(); - got_rnode = true; - } else if !got_rnode { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; */ - } - - blk.rnode_spcnode = rnode_spcnode; - blk.rnode_dbnode = rnode_dbnode; - blk.rnode_relnode = rnode_relnode; - - blk.blkno = buf.get_u32_le(); - trace!( - "this record affects {}/{}/{} blk {}", - rnode_spcnode, - rnode_dbnode, - rnode_relnode, - blk.blkno - ); - - decoded.blocks.push(blk); - } - - _ => { - // TODO: invalid block_id - } - } - } - - // 3. Decode blocks. - let mut ptr = record.len() - buf.remaining(); - for blk in decoded.blocks.iter_mut() { - if blk.has_image { - blk.bimg_offset = ptr as u32; - ptr += blk.bimg_len as usize; - } - if blk.has_data { - ptr += blk.data_len as usize; - } - } - // We don't need them, so just skip blocks_total_len bytes - buf.advance(blocks_total_len as usize); - assert_eq!(ptr, record.len() - buf.remaining()); - - let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; - - // 4. Decode main_data - if main_data_len > 0 { - assert_eq!(buf.remaining(), main_data_len as usize); - } - - decoded.xl_xid = xlogrec.xl_xid; - decoded.xl_info = xlogrec.xl_info; - decoded.xl_rmid = xlogrec.xl_rmid; - decoded.record = record; - decoded.origin_id = origin_id; - decoded.main_data_offset = main_data_offset; - - Ok(()) -} - -/// -/// Build a human-readable string to describe a WAL record -/// -/// For debugging purposes -pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { - match rec { - NeonWalRecord::Postgres { will_init, rec } => Ok(format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec)? - )), - _ => Ok(format!("{:?}", rec)), - } -} - -fn describe_postgres_wal_record(record: &Bytes) -> Result { +pub fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? 
// Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml new file mode 100644 index 0000000000..3f80f8fcdb --- /dev/null +++ b/libs/wal_decoder/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "wal_decoder" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +testing = [] + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +pageserver_api.workspace = true +postgres_ffi.workspace = true +serde.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/libs/wal_decoder/src/decoder.rs @@ -0,0 +1 @@ + diff --git a/libs/wal_decoder/src/lib.rs b/libs/wal_decoder/src/lib.rs new file mode 100644 index 0000000000..05349d17c9 --- /dev/null +++ b/libs/wal_decoder/src/lib.rs @@ -0,0 +1,2 @@ +pub mod decoder; +pub mod models; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs new file mode 100644 index 0000000000..58f8e1b2da --- /dev/null +++ b/libs/wal_decoder/src/models.rs @@ -0,0 +1,167 @@ +//! This module houses types which represent decoded PG WAL records +//! ready for the pageserver to interpret. They are derived from the original +//! WAL records, so that each struct corresponds closely to one WAL record of +//! a specific kind. They contain the same information as the original WAL records, +//! just decoded into structs and fields for easier access. +//! +//! The ingestion code uses these structs to help with parsing the WAL records, +//! and it splits them into a stream of modifications to the key-value pairs that +//! are ultimately stored in delta layers. See also the split-out counterparts in +//! [`postgres_ffi::walrecord`]. +//! +//! The pipeline which processes WAL records is not super obvious, so let's follow +//! the flow of an example XACT_COMMIT Postgres record: +//! +//! (Postgres XACT_COMMIT record) +//! | +//! |--> pageserver::walingest::WalIngest::decode_xact_record +//! | +//! |--> ([`XactRecord::Commit`]) +//! | +//! |--> pageserver::walingest::WalIngest::ingest_xact_record +//! | +//! |--> (NeonWalRecord::ClogSetCommitted) +//! | +//! 
|--> write to KV store within the pageserver + +use bytes::Bytes; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::walrecord::{ + XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, + XlSmgrTruncate, XlXactParsedRecord, +}; +use postgres_ffi::{Oid, TransactionId}; +use utils::lsn::Lsn; + +pub enum HeapamRecord { + ClearVmBits(ClearVmBits), +} + +pub struct ClearVmBits { + pub new_heap_blkno: Option, + pub old_heap_blkno: Option, + pub vm_rel: RelTag, + pub flags: u8, +} + +pub enum NeonrmgrRecord { + ClearVmBits(ClearVmBits), +} + +pub enum SmgrRecord { + Create(SmgrCreate), + Truncate(XlSmgrTruncate), +} + +pub struct SmgrCreate { + pub rel: RelTag, +} + +pub enum DbaseRecord { + Create(DbaseCreate), + Drop(DbaseDrop), +} + +pub struct DbaseCreate { + pub db_id: Oid, + pub tablespace_id: Oid, + pub src_db_id: Oid, + pub src_tablespace_id: Oid, +} + +pub struct DbaseDrop { + pub db_id: Oid, + pub tablespace_ids: Vec, +} + +pub enum ClogRecord { + ZeroPage(ClogZeroPage), + Truncate(ClogTruncate), +} + +pub struct ClogZeroPage { + pub segno: u32, + pub rpageno: u32, +} + +pub struct ClogTruncate { + pub pageno: u32, + pub oldest_xid: TransactionId, + pub oldest_xid_db: Oid, +} + +pub enum XactRecord { + Commit(XactCommon), + Abort(XactCommon), + CommitPrepared(XactCommon), + AbortPrepared(XactCommon), + Prepare(XactPrepare), +} + +pub struct XactCommon { + pub parsed: XlXactParsedRecord, + pub origin_id: u16, + // Fields below are only used for logging + pub xl_xid: TransactionId, + pub lsn: Lsn, +} + +pub struct XactPrepare { + pub xl_xid: TransactionId, + pub data: Bytes, +} + +pub enum MultiXactRecord { + ZeroPage(MultiXactZeroPage), + Create(XlMultiXactCreate), + Truncate(XlMultiXactTruncate), +} + +pub struct MultiXactZeroPage { + pub slru_kind: SlruKind, + pub segno: u32, + pub rpageno: u32, +} + +pub enum RelmapRecord { + Update(RelmapUpdate), +} + +pub struct RelmapUpdate { + pub update: XlRelmapUpdate, + pub buf: Bytes, +} + +pub enum XlogRecord { + Raw(RawXlogRecord), +} + +pub struct RawXlogRecord { + pub info: u8, + pub lsn: Lsn, + pub buf: Bytes, +} + +pub enum LogicalMessageRecord { + Put(PutLogicalMessage), + #[cfg(feature = "testing")] + Failpoint, +} + +pub struct PutLogicalMessage { + pub path: String, + pub buf: Bytes, +} + +pub enum StandbyRecord { + RunningXacts(StandbyRunningXacts), +} + +pub struct StandbyRunningXacts { + pub oldest_running_xid: TransactionId, +} + +pub enum ReploriginRecord { + Set(XlReploriginSet), + Drop(XlReploriginDrop), +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2531abc7a1..ecb8fa7491 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing" ] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] [dependencies] anyhow.workspace = true @@ -83,6 +83,7 @@ enum-map.workspace = true enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +wal_decoder.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index d98b23acce..0a1ad9cd6b 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -8,13 +8,12 @@ use pageserver::{ context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, - repository::Value, task_mgr::TaskKind, tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; -use pageserver_api::{key::Key, shard::TenantShardId}; +use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1353e79f7c..5c5b52db44 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,9 +1,9 @@ use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; +use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 45936cb3fa..d3551b56e1 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -60,7 +60,8 @@ use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ future::Future, diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index bc939f9688..177e65ef79 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -51,7 +51,7 @@ //! 
use anyhow::{Context, Result}; -use pageserver::repository::Key; +use pageserver_api::key::Key; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 7dd2a5d05c..451d2a1d69 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -14,12 +14,12 @@ use std::ops::Range; use std::{fs, str}; use pageserver::page_cache::{self, PAGE_SZ}; -use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; use pageserver::virtual_file::{self, VirtualFile}; +use pageserver_api::key::{Key, KEY_SIZE}; use utils::{bin_ser::BeSer, lsn::Lsn}; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index c0b2b6ae89..22627d72c8 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -14,13 +14,13 @@ use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ - repository::{Key, KEY_SIZE}, tenant::{ block_io::FileBlockReader, disk_btree::VisitDirection, storage_layer::delta_layer::DELTA_KEY_SIZE, }, virtual_file::VirtualFile, }; +use pageserver_api::key::{Key, KEY_SIZE}; use std::fs; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 73bdc90213..7733bdb640 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -696,7 +696,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -705,7 +705,6 @@ mod test { use crate::{ controller_upcall_client::RetryForeverError, - repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3943f62ac0..2d8f4309ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - struct Key(crate::repository::Key); + struct Key(pageserver_api::key::Key); impl std::str::FromStr for Key { type Err = anyhow::Error; fn from_str(s: &str) -> std::result::Result { - crate::repository::Key::from_hex(s).map(Key) + pageserver_api::key::Key::from_hex(s).map(Key) } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ca87f1d080..530c91c4da 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use crate::walrecord::decode_wal_record; -use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; +use 
postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use postgres_ffi::ControlFileData; use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d51931c768..ef6711397a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -24,7 +24,6 @@ pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; -pub mod repository; pub mod span; pub(crate) mod statvfs; pub mod task_mgr; @@ -32,7 +31,6 @@ pub mod tenant; pub mod utilization; pub mod virtual_file; pub mod walingest; -pub mod walrecord; pub mod walredo; use camino::Utf8Path; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 19233a28cc..dc2dc08b53 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,14 +7,14 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::aux_file; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::walrecord::NeonWalRecord; -use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, @@ -22,7 +22,9 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7011ae9e63..8445601d29 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -92,11 +92,11 @@ use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; -use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; +use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; @@ -160,6 +160,7 @@ pub(crate) mod timeline; pub mod size; mod gc_block; +mod gc_result; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -467,10 +468,10 @@ impl WalRedoManager { /// This method is cancellation-safe. 
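    /// A hedged call sketch (the surrounding variables are assumptions, not code from
    /// this patch); the only difference after this change is that `Key` and
    /// `NeonWalRecord` are imported from `pageserver_api` rather than from the
    /// pageserver crate itself:
    ///
    ///     let page = redo_manager
    ///         .request_redo(key, lsn, Some((base_lsn, base_img)), records, pg_version)
    ///         .await?;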
pub async fn request_redo( &self, - key: crate::repository::Key, + key: pageserver_api::key::Key, lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, - records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, + records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, ) -> Result { match self { @@ -4818,7 +4819,8 @@ pub(crate) mod harness { use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use crate::{repository::Key, walrecord::NeonWalRecord}; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use super::*; use hex_literal::hex; @@ -5087,25 +5089,30 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; - use crate::walrecord::NeonWalRecord; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + use pageserver_api::value::Value; use rand::{thread_rng, Rng}; use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - use timeline::{DeltaLayerTestDesc, GcInfo}; + use timeline::DeltaLayerTestDesc; use utils::id::TenantId; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -7670,6 +7677,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { let harness = TenantHarness::create("test_neon_test_record").await?; @@ -7861,6 +7869,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; @@ -8057,6 +8066,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_generate_key_retention() -> anyhow::Result<()> { let harness = TenantHarness::create("test_generate_key_retention").await?; @@ -8404,6 +8414,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { let harness = @@ -8644,6 +8655,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> { @@ -8852,6 +8864,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; @@ -9053,6 +9066,7 @@ mod tests { // // When querying the key range [A, B) we need to read at different LSN ranges // for [A, C) and [C, B). This test checks that the described edge case is handled correctly. 
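    // A small worked shape of that edge case (the concrete values are invented for this
    // note, not taken from the test body below): suppose C splits the range, an image
    // layer covers [C, B) at LSN 0x40, while [A, C) only has delta layers up to LSN
    // 0x30. A single vectored get over [A, B) must then plan [A, C) and [C, B) at
    // different LSNs and merge the two partial results into one response.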
+ #[cfg(feature = "testing")] #[tokio::test] async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs new file mode 100644 index 0000000000..c805aafeab --- /dev/null +++ b/pageserver/src/tenant/gc_result.rs @@ -0,0 +1,57 @@ +use anyhow::Result; +use serde::Serialize; +use std::ops::AddAssign; +use std::time::Duration; + +/// +/// Result of performing GC +/// +#[derive(Default, Serialize, Debug)] +pub struct GcResult { + pub layers_total: u64, + pub layers_needed_by_cutoff: u64, + pub layers_needed_by_pitr: u64, + pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, + pub layers_not_updated: u64, + pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + + #[serde(serialize_with = "serialize_duration_as_millis")] + pub elapsed: Duration, + + /// The layers which were garbage collected. + /// + /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be + /// dropped in tests. + #[cfg(feature = "testing")] + #[serde(skip)] + pub(crate) doomed_layers: Vec, +} + +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + +impl AddAssign for GcResult { + fn add_assign(&mut self, other: Self) { + self.layers_total += other.layers_total; + self.layers_needed_by_pitr += other.layers_needed_by_pitr; + self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; + self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; + self.layers_not_updated += other.layers_not_updated; + self.layers_removed += other.layers_removed; + + self.elapsed += other.elapsed; + + #[cfg(feature = "testing")] + { + let mut other = other; + self.doomed_layers.append(&mut other.doomed_layers); + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 707233b003..7f15baed10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,9 +48,9 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0567f8f3a7..a4c458b737 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2811,7 +2811,7 @@ where } use { - crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, + crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4a63491e90..8f4219bbbc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -11,11 +11,11 @@ mod layer_name; pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; -use crate::repository::Value; -use 
crate::walrecord::NeonWalRecord; use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 272e422c90..8a397ceb7a 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; +use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::value::Value; use super::layer::S3_UPLOAD_LIMIT; use super::{ diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 641729d681..10165b1d06 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -30,7 +30,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::{walrecord, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; @@ -54,9 +53,11 @@ use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1293,7 +1294,7 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? 
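                    // A hedged reminder of what `will_init` means here (the match shapes are
                    // paraphrased from the walrecord code this series moves around; exact field
                    // lists are omitted on purpose):
                    //
                    //     Value::Image(_)                                   // always initializes the page
                    //     Value::WalRecord(NeonWalRecord::Postgres {
                    //         will_init: true, ..                           // record rebuilds the page alone
                    //     })
                    //
                    // Once such a value is found, the read path needs no older history for the key.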
// FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = crate::repository::ValueBytes::will_init(&data) + let will_init = pageserver_api::value::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); @@ -1356,7 +1357,7 @@ impl DeltaLayerInner { format!(" img {} bytes", img.len()) } Value::WalRecord(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec)?; + let wal_desc = pageserver_api::record::describe_wal_record(&rec)?; format!( " rec {} bytes will_init: {} {}", buf.len(), @@ -1610,7 +1611,6 @@ pub(crate) mod test { use rand::RngCore; use super::*; - use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; @@ -1622,6 +1622,7 @@ pub(crate) mod test { DEFAULT_PG_VERSION, }; use bytes::Bytes; + use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1974,8 +1975,8 @@ pub(crate) mod test { #[tokio::test] async fn copy_delta_prefix_smoke() { - use crate::walrecord::NeonWalRecord; use bytes::Bytes; + use pageserver_api::record::NeonWalRecord; let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") .await @@ -2198,6 +2199,7 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + #[cfg(feature = "testing")] pub(crate) fn sort_delta_value( (k1, l1, v1): &(Key, Lsn, Value), (k2, l2, v2): &(Key, Lsn, Value), diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index f45dd4b801..ccfcf68e8f 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -7,7 +7,7 @@ use pageserver_api::{ }; use utils::lsn::Lsn; -use crate::repository::Value; +use pageserver_api::value::Value; use super::merge_iterator::MergeIterator; @@ -121,8 +121,8 @@ mod tests { #[tokio::test] async fn filter_keyspace_iterator() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") .await diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3f90df312d..c0d183dc08 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -28,7 +28,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -51,8 +50,10 @@ use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1125,6 +1126,7 @@ mod test { use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, 
ShardNumber, ShardStripeSize}, + value::Value, }; use utils::{ generation::Generation, @@ -1134,7 +1136,6 @@ mod test { use crate::{ context::RequestContext, - repository::Value, tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7573ddb5cc..df448a0963 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -7,7 +7,6 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::repository::{Key, Value}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; @@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; +use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 9de70f14ee..36dcc8d805 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() { /// Also checks that the same does not happen on a non-evicted layer (regression test). #[tokio::test(start_paused = true)] async fn eviction_cancellation_on_drop() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - crate::repository::Key::from_i128(5), + pageserver_api::key::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a30c25d780..2097e90764 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId; use std::ops::Range; use utils::{id::TimelineId, lsn::Lsn}; -use crate::repository::Key; +use pageserver_api::key::Key; use super::{DeltaLayerName, ImageLayerName, LayerName}; diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 8e750e1187..2b98d74f9f 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,7 +1,7 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! 
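//! A hedged reminder of the shapes parsed here (field widths quoted from memory, not
//! from this patch): delta layer names look roughly like
//!
//!     <key_start>-<key_end>__<lsn_start>-<lsn_end>
//!
//! and image layer names like
//!
//!     <key_start>-<key_end>__<lsn>
//!
//! with keys printed as fixed-width hex and LSNs as 16-digit hex.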
-use crate::repository::Key; +use pageserver_api::key::Key; use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index f91e27241d..980202f12c 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -7,7 +7,8 @@ use anyhow::bail; use pageserver_api::key::Key; use utils::lsn::Lsn; -use crate::{context::RequestContext, repository::Value}; +use crate::context::RequestContext; +use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, @@ -291,12 +292,16 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, }, - walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, expect: &[(Key, Lsn, Value)], @@ -319,8 +324,8 @@ mod tests { #[tokio::test] async fn merge_in_between() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_merge_in_between") .await @@ -384,8 +389,8 @@ mod tests { #[tokio::test] async fn delta_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_merge") .await @@ -458,10 +463,11 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } + #[cfg(feature = "testing")] #[tokio::test] async fn delta_image_mixed_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") .await @@ -586,5 +592,6 @@ mod tests { is_send(merge_iter); } + #[cfg(feature = "testing")] fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f8d61dac5e..d765a7c987 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -125,11 +125,12 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; -use crate::repository::GcResult; -use crate::repository::{Key, Value}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::gc_result::GcResult; use crate::ZERO_PAGE; +use pageserver_api::key::Key; +use pageserver_api::value::Value; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -5822,17 +5823,15 @@ fn is_send() { #[cfg(test)] mod tests { use pageserver_api::key::Key; + use pageserver_api::value::Value; use utils::{id::TimelineId, lsn::Lsn}; - use crate::{ - repository::Value, - tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName}, - timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, - }, + use crate::tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, }; #[tokio::test] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 
73e4f0e87c..70f93656cd 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, }; -use crate::keyspace::KeySpace; -use crate::repository::{Key, Value}; -use crate::walrecord::NeonWalRecord; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use utils::lsn::Lsn; @@ -2148,7 +2149,7 @@ struct ResidentDeltaLayer(ResidentLayer); struct ResidentImageLayer(ResidentLayer); impl CompactionJobExecutor for TimelineAdaptor { - type Key = crate::repository::Key; + type Key = pageserver_api::key::Key; type Layer = OwnArc; type DeltaLayer = ResidentDeltaLayer; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index cee259e2e0..739fadbc6b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,11 +31,11 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use utils::{id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9e43e10801..27b3f93845 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,8 +29,10 @@ use std::time::Instant; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; +use wal_decoder::models::*; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; @@ -44,9 +46,9 @@ use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; -use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -108,143 +110,6 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } -// These structs are an intermediary representation of the PostgreSQL WAL records. -// The ones prefixed with `Xl` are lower level, while the ones that are not have -// all the required context to be acted upon by the pageserver. 
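// A hedged contrast of the two levels named above (the types are the ones in this
// series, the pairing notes are mine):
//
//     XlSmgrTruncate { .. }                        // raw struct, decoded straight from WAL bytes
//     SmgrRecord::Truncate(XlSmgrTruncate { .. })  // wrapped with enough context for ingest
//
//     XlXactParsedRecord { .. }                    // low-level parsed commit/abort record
//     XactRecord::Commit(XactCommon { .. })        // carries the parsed record plus origin_id,
//                                                  // and xl_xid/lsn used only for logging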
- -enum HeapamRecord { - ClearVmBits(ClearVmBits), -} - -struct ClearVmBits { - new_heap_blkno: Option, - old_heap_blkno: Option, - vm_rel: RelTag, - flags: u8, -} - -enum NeonrmgrRecord { - ClearVmBits(ClearVmBits), -} - -enum SmgrRecord { - Create(SmgrCreate), - Truncate(XlSmgrTruncate), -} - -struct SmgrCreate { - rel: RelTag, -} - -enum DbaseRecord { - Create(DbaseCreate), - Drop(DbaseDrop), -} - -struct DbaseCreate { - db_id: u32, - tablespace_id: u32, - src_db_id: u32, - src_tablespace_id: u32, -} - -struct DbaseDrop { - db_id: u32, - tablespace_ids: Vec, -} - -enum ClogRecord { - ZeroPage(ClogZeroPage), - Truncate(ClogTruncate), -} - -struct ClogZeroPage { - segno: u32, - rpageno: u32, -} - -struct ClogTruncate { - pageno: u32, - oldest_xid: u32, - oldest_xid_db: u32, -} - -enum XactRecord { - Commit(XactCommon), - Abort(XactCommon), - CommitPrepared(XactCommon), - AbortPrepared(XactCommon), - Prepare(XactPrepare), -} - -struct XactCommon { - parsed: XlXactParsedRecord, - origin_id: u16, - // Fields below are only used for logging - xl_xid: u32, - lsn: Lsn, -} - -struct XactPrepare { - xl_xid: u32, - data: Bytes, -} - -enum MultiXactRecord { - ZeroPage(MultiXactZeroPage), - Create(XlMultiXactCreate), - Truncate(XlMultiXactTruncate), -} - -struct MultiXactZeroPage { - slru_kind: SlruKind, - segno: u32, - rpageno: u32, -} - -enum RelmapRecord { - Update(RelmapUpdate), -} - -struct RelmapUpdate { - update: XlRelmapUpdate, - buf: Bytes, -} - -enum XlogRecord { - Raw(RawXlogRecord), -} - -struct RawXlogRecord { - info: u8, - lsn: Lsn, - buf: Bytes, -} - -enum LogicalMessageRecord { - Put(PutLogicalMessage), - #[cfg(feature = "testing")] - Failpoint, -} - -struct PutLogicalMessage { - path: String, - buf: Bytes, -} - -enum StandbyRecord { - RunningXacts(StandbyRunningXacts), -} - -struct StandbyRunningXacts { - oldest_running_xid: u32, -} - -enum ReploriginRecord { - Set(XlReploriginSet), - Drop(XlReploriginDrop), -} - impl WalIngest { pub async fn new( timeline: &Timeline, @@ -284,7 +149,6 @@ impl WalIngest { /// relations/pages that the record affects. 
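    /// A hedged usage sketch (the argument list after `decoded` is quoted from memory
    /// of the surrounding callers, not from this hunk):
    ///
    ///     let mut decoded = DecodedWALRecord::default();
    ///     decode_wal_record(record_bytes, &mut decoded, pg_version)?;
    ///     walingest
    ///         .ingest_record(decoded, lsn, &mut modification, &ctx)
    ///         .await?;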
/// /// This function returns `true` if the record was ingested, and `false` if it was filtered out - /// pub async fn ingest_record( &mut self, decoded: DecodedWALRecord, @@ -2218,7 +2082,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = crate::walrecord::XlLogicalMessage::decode(buf); + let xlrec = XlLogicalMessage::decode(buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; #[cfg(feature = "testing")] @@ -2246,7 +2110,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = crate::walrecord::XlRunningXacts::decode(buf); + let xlrec = XlRunningXacts::decode(buf); return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { oldest_running_xid: xlrec.oldest_running_xid, }))); @@ -2276,10 +2140,10 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = crate::walrecord::XlReploriginSet::decode(buf); + let xlrec = XlReploriginSet::decode(buf); return Ok(Some(ReploriginRecord::Set(xlrec))); } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = crate::walrecord::XlReploriginDrop::decode(buf); + let xlrec = XlReploriginDrop::decode(buf); return Ok(Some(ReploriginRecord::Drop(xlrec))); } @@ -3146,6 +3010,7 @@ mod tests { async fn test_ingest_real_wal() { use crate::tenant::harness::*; use postgres_ffi::waldecoder::WalStreamDecoder; + use postgres_ffi::walrecord::decode_wal_record; use postgres_ffi::WAL_SEGMENT_SIZE; // Define test data path and constants. diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a1c9fc5651..027a6eb7d7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -29,11 +29,11 @@ use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; -use crate::repository::Key; -use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; +use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::future::Future; use std::sync::Arc; @@ -548,9 +548,10 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { use super::PostgresRedoManager; - use crate::repository::Key; - use crate::{config::PageServerConf, walrecord::NeonWalRecord}; + use crate::config::PageServerConf; use bytes::Bytes; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::str::FromStr; use tracing::Instrument; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index c067787f97..7aaa357318 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,8 @@ -use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use pageserver_api::key::Key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon( // No-op: this record will never be created in aux v2. 
warn!("AuxFile record should not be created in aux v2"); } - #[cfg(test)] + #[cfg(feature = "testing")] NeonWalRecord::Test { append, clear, diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index f3197e68b5..7e9477cfbc 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -8,10 +8,10 @@ use crate::{ metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, page_cache::PAGE_SZ, span::debug_assert_current_span_has_tenant_id, - walrecord::NeonWalRecord, }; use anyhow::Context; use bytes::Bytes; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use postgres_ffi::BLCKSZ; #[cfg(feature = "testing")] From a73402e646f9840fca2712045c37c37ba848dfcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Oct 2024 11:41:53 +0100 Subject: [PATCH 19/27] Offloaded timeline deletion (#9519) As pointed out in https://github.com/neondatabase/neon/pull/9489#discussion_r1814699683 , we currently didn't support deletion for offloaded timelines after the timeline has been loaded from the manifest instead of having been offloaded. This was because the upload queue hasn't been initialized yet. This PR thus initializes the timeline and shuts it down immediately. Part of #8088 --- pageserver/src/tenant.rs | 15 +--- pageserver/src/tenant/timeline/delete.rs | 31 ++++++- test_runner/regress/test_timeline_archive.py | 88 +++++++++++++------- 3 files changed, 88 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8445601d29..7f8af67c2c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -626,19 +626,10 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { + fn maybe_remote_client(&self) -> Option> { match self { - TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), - TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { - Some(remote_client) => remote_client, - None => { - let remote_client = tenant.build_timeline_client( - offloaded.timeline_id, - tenant.remote_storage.clone(), - ); - Arc::new(remote_client) - } - }, + TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(), } } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a664bb59e1..53b65da515 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, instrument, Instrument}; +use tracing::{error, info, info_span, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ @@ -15,7 +15,7 @@ use crate::{ tenant::{ metadata::TimelineMetadata, remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }, CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, @@ -258,7 +258,32 @@ impl DeleteTimelineFlow { ))? 
}); - let remote_client = timeline.remote_client_maybe_construct(tenant); + let remote_client = match timeline.maybe_remote_client() { + Some(remote_client) => remote_client, + None => { + let remote_client = tenant + .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone()); + let result = remote_client + .download_index_file(&tenant.cancel) + .instrument(info_span!("download_index_file")) + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(p) => { + tracing::info!("Timeline already set as deleted in remote index"); + p + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let remote_client = Arc::new(remote_client); + + remote_client + .init_upload_queue(&index_part) + .map_err(DeleteTimelineError::Other)?; + remote_client.shutdown().await; + remote_client + } + }; set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index cb8724dd1c..77efd7b749 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b } ) - # Create two branches and archive them - parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) - leaf_timeline_id = env.create_branch( - "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + # Create three branches that depend on each other, starting with two + grandparent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_grandparent", tenant_id + ) + parent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent" ) + # write some stuff to the parent with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: endpoint.safe_psql_many( [ @@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + # create the third branch + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b state=TimelineArchivalState.ARCHIVED, ) + ps_http.timeline_archival_config( + tenant_id, + grandparent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + def timeline_offloaded_logged(timeline_id: TimelineId) -> bool: return ( env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") @@ -201,30 +215,34 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_archival_config( tenant_id, - parent_timeline_id, + grandparent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) ps_http.timeline_archival_config( tenant_id, - leaf_timeline_id, + parent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) - leaf_detail = ps_http.timeline_detail( + parent_detail = ps_http.timeline_detail( tenant_id, - leaf_timeline_id, + parent_timeline_id, ) - assert leaf_detail["is_archived"] is False + 
assert parent_detail["is_archived"] is False with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") assert sum == sum_again + # Test that deletion of offloaded timelines works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) + assert not timeline_offloaded_logged(initial_timeline_id) -def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("delete_timeline", [False, True]) +def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool): """ Test for persistence of timeline offload state """ @@ -306,27 +324,35 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) - ps_http.timeline_archival_config( - tenant_id, - child_timeline_id, - state=TimelineArchivalState.UNARCHIVED, - ) - child_detail = ps_http.timeline_detail( - tenant_id, - child_timeline_id, - ) - assert child_detail["is_archived"] is False + if delete_timeline: + ps_http.timeline_delete(tenant_id, child_timeline_id) + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + else: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + child_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert child_detail["is_archived"] is False - with env.endpoints.create_start( - "test_archived_branch_persisted", tenant_id=tenant_id - ) as endpoint: - sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") - assert sum == sum_again + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + assert sum == sum_again - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", - ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", + ) assert not timeline_offloaded_api(root_timeline_id) From 45b558f480e76f46a61eb97504931c8bd211457b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 29 Oct 2024 11:53:28 +0100 Subject: [PATCH 20/27] temporarily increase timeout for clickbench benchmark until regression is resolved (#9554) ## Problem click bench job in benchmarking workflow has a performance regression causing it to run in timeout of max job run. Suspected root cause: Project has been migrated from single pageserver to storage controller managed project on Oct 14th. Since then the regression shows. ## Summary of changes Increase timeout of pytest to 12 hours. 
Increase job timeout to 12 hours --- .github/workflows/benchmarking.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5ccfe48684..69b8bc5d70 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -671,6 +671,10 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + # Increase timeout to 12h, default timeout is 6h + # we have regression in clickbench causing it to run 2-3x longer + timeout-minutes: 720 + steps: - uses: actions/checkout@v4 @@ -716,7 +720,7 @@ jobs: test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" From 47c35f67c392a9642a4f0ccaeb326a53913449e4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 29 Oct 2024 11:01:09 +0000 Subject: [PATCH 21/27] [proxy]: fix JWT handling for AWS cognito. (#9536) In the base64 payload of an aws cognito jwt, I saw the following: ``` "iss":"https:\/\/cognito-idp.us-west-2.amazonaws.com\/us-west-2_redacted" ``` issuers are supposed to be URLs, and URLs are always valid un-escaped JSON. However, `\/` is a valid escape character so what AWS is doing is technically correct... sigh... This PR refactors the test suite and adds a new regression test for cognito. --- proxy/src/auth/backend/jwt.rs | 508 +++++++++++++++++++++++++--------- 1 file changed, 383 insertions(+), 125 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 69ab4b8ccb..83c3617612 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::future::Future; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError { RoleJwksNotConfigured, } +#[derive(Clone)] pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, @@ -277,7 +279,7 @@ impl JwkCacheEntryLock { // get the key from the JWKs if possible. If not, wait for the keys to update. 
let (jwk, expected_audience) = loop { - match guard.find_jwk_and_audience(kid, role_name) { + match guard.find_jwk_and_audience(&kid, role_name) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); @@ -312,7 +314,9 @@ impl JwkCacheEntryLock { if let Some(aud) = expected_audience { if payload.audience.0.iter().all(|s| s != aud) { - return Err(JwtError::InvalidJwtTokenAudience); + return Err(JwtError::InvalidClaims( + JwtClaimsError::InvalidJwtTokenAudience, + )); } } @@ -320,13 +324,15 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenHasExpired); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenNotYetReadyToUse); + return Err(JwtError::InvalidClaims( + JwtClaimsError::JwtTokenNotYetReadyToUse, + )); } } @@ -420,8 +426,8 @@ struct JwtHeader<'a> { #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase - #[serde(rename = "kid")] - key_id: Option<&'a str>, + #[serde(rename = "kid", borrow)] + key_id: Option>, } /// @@ -440,17 +446,17 @@ struct JwtPayload<'a> { // the following entries are only extracted for the sake of debug logging. /// Issuer of the JWT - #[serde(rename = "iss")] - issuer: Option<&'a str>, + #[serde(rename = "iss", borrow)] + issuer: Option>, /// Subject of the JWT (the user) - #[serde(rename = "sub")] - subject: Option<&'a str>, + #[serde(rename = "sub", borrow)] + subject: Option>, /// Unique token identifier - #[serde(rename = "jti")] - jwt_id: Option<&'a str>, + #[serde(rename = "jti", borrow)] + jwt_id: Option>, /// Unique session identifier - #[serde(rename = "sid")] - session_id: Option<&'a str>, + #[serde(rename = "sid", borrow)] + session_id: Option>, } /// `OneOrMany` supports parsing either a single item or an array of items. 
@@ -585,14 +591,8 @@ pub(crate) enum JwtError { #[error("Provided authentication token is not a valid JWT encoding")] JwtEncoding(#[from] JwtEncodingError), - #[error("invalid JWT token audience")] - InvalidJwtTokenAudience, - - #[error("JWT token has expired")] - JwtTokenHasExpired, - - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error(transparent)] + InvalidClaims(#[from] JwtClaimsError), #[error("invalid P256 key")] InvalidP256Key(jose_jwk::crypto::Error), @@ -644,6 +644,19 @@ pub enum JwtEncodingError { InvalidCompactForm, } +#[derive(Error, Debug, PartialEq)] +#[non_exhaustive] +pub enum JwtClaimsError { + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, +} + #[allow(dead_code, reason = "Debug use only")] #[derive(Debug)] pub(crate) enum KeyType { @@ -680,6 +693,8 @@ mod tests { use hyper_util::rt::TokioIo; use rand::rngs::OsRng; use rsa::pkcs8::DecodePrivateKey; + use serde::Serialize; + use serde_json::json; use signature::Signer; use tokio::net::TcpListener; @@ -693,6 +708,7 @@ mod tests { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; @@ -706,24 +722,47 @@ mod tests { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; (sk, jwk) } + fn now() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + } + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let now = now(); + let body = typed_json::json! {{ + "exp": now + 3600, + "nbf": now, + "aud": ["audience1", "neon", "audience2"], + "sub": "user1", + "sid": "session1", + "jti": "token1", + "iss": "neon-testing", + }}; + build_custom_jwt_payload(kid, body, sig) + } + + fn build_custom_jwt_payload( + kid: String, + body: impl Serialize, + sig: jose_jwa::Signing, + ) -> String { let header = JwtHeader { algorithm: jose_jwa::Algorithm::Signing(sig), - key_id: Some(&kid), + key_id: Some(Cow::Owned(kid)), }; - let body = typed_json::json! 
{{ - "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, - }}; let header = base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); - let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD); format!("{header}.{body}") } @@ -738,6 +777,16 @@ mod tests { format!("{payload}.{sig}") } + fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { use rsa::pkcs1v15::SigningKey; use rsa::signature::SignatureEncoding; @@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL -----END PRIVATE KEY----- "; - #[tokio::test] - async fn renew() { - let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); - let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); - let (ec1, jwk3) = new_ec_jwk("3".into()); - let (ec2, jwk4) = new_ec_jwk("4".into()); + #[derive(Clone)] + struct Fetch(Vec); - let foo_jwks = jose_jwk::JwkSet { - keys: vec![jwk1, jwk3], - }; - let bar_jwks = jose_jwk::JwkSet { - keys: vec![jwk2, jwk4], - }; + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules( + &self, + _ctx: &RequestMonitoring, + _endpoint: EndpointId, + ) -> Result, FetchAuthRulesError> { + Ok(self.0.clone()) + } + } + async fn jwks_server( + router: impl for<'a> Fn(&'a str) -> Option> + Send + Sync + 'static, + ) -> SocketAddr { + let router = Arc::new(router); let service = service_fn(move |req| { - let foo_jwks = foo_jwks.clone(); - let bar_jwks = bar_jwks.clone(); + let router = Arc::clone(&router); async move { - let jwks = match req.uri().path() { - "/foo" => &foo_jwks, - "/bar" => &bar_jwks, - _ => { - return Response::builder() - .status(404) - .body(Full::new(Bytes::new())); - } - }; - let body = serde_json::to_vec(jwks).unwrap(); - Response::builder() - .status(200) - .body(Full::new(Bytes::from(body))) + match router(req.uri().path()) { + Some(body) => Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))), + None => Response::builder() + .status(404) + .body(Full::new(Bytes::new())), + } } }); @@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } }); - let client = reqwest::Client::new(); + addr + } - #[derive(Clone)] - struct Fetch(SocketAddr, Vec); + #[tokio::test] + async fn check_jwt_happy_path() { + let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into()); + let (ec1, jwk3) = new_ec_jwk("ec1".into()); + let (ec2, jwk4) = new_ec_jwk("ec2".into()); - impl FetchAuthRules for Fetch { - async fn fetch_auth_rules( - &self, - _ctx: &RequestMonitoring, - _endpoint: EndpointId, - ) -> Result, FetchAuthRulesError> { - Ok(vec![ - AuthRule { - id: "foo".to_owned(), - jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - AuthRule { - id: "bar".to_owned(), - jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - ]) - } - } + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + 
keys: vec![jwk2, jwk4], + }; + + let jwks_addr = jwks_server(move |path| match path { + "/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()), + "/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()), + _ => None, + }) + .await; let role_name1 = RoleName::from("anonymous"); let role_name2 = RoleName::from("authenticated"); - let fetch = Fetch( - addr, - vec![ - RoleNameInt::from(&role_name1), - RoleNameInt::from(&role_name2), - ], - ); + let roles = vec![ + RoleNameInt::from(&role_name1), + RoleNameInt::from(&role_name2), + ]; + let rules = vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + ]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); - let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - - let jwt1 = new_rsa_jwt("1".into(), rs1); - let jwt2 = new_rsa_jwt("2".into(), rs2); - let jwt3 = new_ec_jwt("3".into(), &ec1); - let jwt4 = new_ec_jwt("4".into(), &ec2); - - // had the wrong kid, therefore will have the wrong ecdsa signature - let bad_jwt = new_ec_jwt("3".into(), &ec2); - // this role_name is not accepted - let bad_role_name = RoleName::from("cloud_admin"); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &bad_jwt, - &client, - endpoint.clone(), - &role_name1, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("signature error")); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &jwt1, - &client, - endpoint.clone(), - &bad_role_name, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("jwk not found")); + let jwt1 = new_rsa_jwt("rs1".into(), rs1); + let jwt2 = new_rsa_jwt("rs2".into(), rs2); + let jwt3 = new_ec_jwt("ec1".into(), &ec1); + let jwt4 = new_ec_jwt("ec2".into(), &ec2); let tokens = [jwt1, jwt2, jwt3, jwt4]; let role_names = [role_name1, role_name2]; @@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( &RequestMonitoring::test(), - token, - &client, endpoint.clone(), role, &fetch, + token, ) .await .unwrap(); } } } + + /// AWS Cognito escapes the `/` in the URL. + #[tokio::test] + async fn check_jwt_regression_cognito_issuer() { + let (key, jwk) = new_ec_jwk("key".into()); + + let now = now(); + let token = new_custom_ec_jwt( + "key".into(), + &key, + typed_json::json! {{ + "sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + // cognito uses `\/`. I cannot replicated that easily here as serde_json will refuse + // to write that escape character. instead I will make a bogus URL using `\` instead. 
+ "iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh", + "client_id": "abcdefghijklmnopqrstuvwxyz", + "origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893", + "event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065", + "token_use": "access", + "scope": "aws.cognito.signin.user.admin", + "auth_time":now, + "exp":now + 60, + "iat":now, + "jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0", + "username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + }}, + ); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + + let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await; + + let role_name = RoleName::from("anonymous"); + let rules = vec![AuthRule { + id: "aws-cognito".to_owned(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role_name)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let endpoint = EndpointId::from("ep"); + + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + endpoint.clone(), + &role_name, + &fetch, + &token, + ) + .await + .unwrap(); + } + + #[tokio::test] + async fn check_jwt_invalid_signature() { + let (_, jwk) = new_ec_jwk("1".into()); + let (key, _) = new_ec_jwk("1".into()); + + // has a matching kid, but signed by the wrong key + let bad_jwt = new_ec_jwt("1".into(), &key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) + .await + .unwrap_err(); + assert!( + matches!(err, JwtError::Signature(_)), + "expected \"signature error\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_unknown_role() { + let (key, jwk) = new_rsa_jwk(RS1, "1".into()); + let jwt = new_rsa_jwt("1".into(), key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + // this role_name is not accepted + let bad_role_name = RoleName::from("cloud_admin"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) + .await + .unwrap_err(); + + assert!( + matches!(err, JwtError::JwkNotFound), + "expected \"jwk not found\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_invalid_claims() { + let (key, jwk) = new_ec_jwk("1".into()); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + struct Test { + body: 
serde_json::Value, + error: JwtClaimsError, + } + + let table = vec![ + Test { + body: json! {{ + "nbf": now + 60, + "aud": "neon", + }}, + error: JwtClaimsError::JwtTokenNotYetReadyToUse, + }, + Test { + body: json! {{ + "exp": now - 60, + "aud": ["neon"], + }}, + error: JwtClaimsError::JwtTokenHasExpired, + }, + Test { + body: json! {{ + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": [], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": "foo", + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo", "bar"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + ]; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: Some("neon".to_string()), + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + for test in table { + let jwt = new_custom_ec_jwt("1".into(), &key, test.body); + + match jwk_cache + .check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt) + .await + { + Err(JwtError::InvalidClaims(error)) if error == test.error => {} + Err(err) => { + panic!("expected {:?}, got {err:?}", test.error) + } + Ok(_payload) => { + panic!("expected {:?}, got ok", test.error) + } + } + } + } } From d4cbc8cfeb433733d312d8761c3f3bab816df04e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 29 Oct 2024 11:39:09 +0000 Subject: [PATCH 22/27] [auth_broker]: regress test (#9541) python based regression test setup for auth_broker. This uses a http mock for cplane as well as the JWKs url. complications: 1. We cannot just use local_proxy binary, as that requires the pg_session_jwt extension which we don't have available in the current test suite 2. We cannot use just any old http mock for local_proxy, as auth_broker requires http2 to local_proxy as such, I used the h2 library to implement an echo server - copied from the examples in the h2 docs. 
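For reference, a minimal sketch of the flow the new test exercises (the hostname, port, and certificate path below are illustrative placeholders, not values taken from this patch): sign a JWT with the generated JWK — whose public part the mocked JWKS endpoint serves — and send a query to the auth broker's `/sql` endpoint, which validates the token and forwards the request over HTTP/2 to the local_proxy echo mock.

```python
from jwcrypto import jwk, jwt
import httpx

# Generate a signing key; the mocked JWKS endpoint would serve its public part.
key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid="test-kid")

# Sign a token that the broker should accept for the "anonymous" role.
token = jwt.JWT(header={"kid": key.key_id, "alg": "RS256"}, claims={"sub": "user1"})
token.make_signed_token(key)

# Post a query through the broker; it checks the JWT against the mocked JWKS
# and forwards the request to the local_proxy echo server over h2.
resp = httpx.post(
    "https://apiauth.localtest.me:4444/sql",  # placeholder port
    json={"query": "select 1", "params": []},
    headers={
        "Neon-Connection-String": "postgresql://anonymous@apiauth.localtest.me/postgres",
        "Authorization": f"Bearer {token.serialize()}",
    },
    verify="proxy.crt",  # placeholder path to the broker's self-signed TLS cert
)
print(resp.json())
```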
--- poetry.lock | 45 +++-- pyproject.toml | 3 + test_runner/conftest.py | 1 + test_runner/fixtures/h2server.py | 198 ++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 239 +++++++++++++++++++++--- test_runner/regress/test_auth_broker.py | 37 ++++ test_runner/stubs/h2/README.md | 1 + test_runner/stubs/h2/__init__.pyi | 0 test_runner/stubs/h2/config.pyi | 42 +++++ test_runner/stubs/h2/connection.pyi | 142 ++++++++++++++ test_runner/stubs/h2/errors.pyi | 17 ++ test_runner/stubs/h2/events.pyi | 106 +++++++++++ test_runner/stubs/h2/exceptions.pyi | 48 +++++ test_runner/stubs/h2/frame_buffer.pyi | 19 ++ test_runner/stubs/h2/settings.pyi | 61 ++++++ test_runner/stubs/h2/stream.pyi | 184 ++++++++++++++++++ test_runner/stubs/h2/utilities.pyi | 25 +++ test_runner/stubs/h2/windows.pyi | 13 ++ 18 files changed, 1143 insertions(+), 38 deletions(-) create mode 100644 test_runner/fixtures/h2server.py create mode 100644 test_runner/regress/test_auth_broker.py create mode 100644 test_runner/stubs/h2/README.md create mode 100644 test_runner/stubs/h2/__init__.pyi create mode 100644 test_runner/stubs/h2/config.pyi create mode 100644 test_runner/stubs/h2/connection.pyi create mode 100644 test_runner/stubs/h2/errors.pyi create mode 100644 test_runner/stubs/h2/events.pyi create mode 100644 test_runner/stubs/h2/exceptions.pyi create mode 100644 test_runner/stubs/h2/frame_buffer.pyi create mode 100644 test_runner/stubs/h2/settings.pyi create mode 100644 test_runner/stubs/h2/stream.pyi create mode 100644 test_runner/stubs/h2/utilities.pyi create mode 100644 test_runner/stubs/h2/windows.pyi diff --git a/poetry.lock b/poetry.lock index 7abd794235..36ea82a446 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1521,6 +1521,21 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "jwcrypto" +version = "1.5.6" +description = "Implementation of JOSE Web standards" +optional = false +python-versions = ">= 3.8" +files = [ + {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, + {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, +] + +[package.dependencies] +cryptography = ">=3.4" +typing-extensions = ">=4.5.0" + [[package]] name = "kafka-python" version = "2.0.2" @@ -2111,7 +2126,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2120,8 +2134,6 @@ files = [ {file = 
"psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2603,7 +2615,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2912,6 +2923,20 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "types-jwcrypto" +version = "1.5.0.20240925" +description = "Typing stubs for jwcrypto" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, + {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, +] + +[package.dependencies] +cryptography = "*" + [[package]] name = "types-psutil" version = "5.9.5.12" @@ -3159,16 +3184,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3406,4 +3421,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf" +content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e" diff --git a/pyproject.toml b/pyproject.toml index d4926cfb9a..faa5f9123c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,9 @@ pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" +jwcrypto = "^1.5.6" +h2 = "^4.1.0" +types-jwcrypto = "^1.5.0.20240925" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4a3194c691..84eda52d33 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ from __future__ import annotations pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", + "fixtures.h2server", "fixtures.httpserver", "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py new file mode 100644 index 0000000000..92783e1fb2 --- /dev/null +++ b/test_runner/fixtures/h2server.py @@ -0,0 +1,198 @@ +""" +https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html + +auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) +""" + +import asyncio +import collections +import io +import json +from collections.abc import AsyncIterable + +import pytest_asyncio +from h2.config import H2Configuration +from h2.connection import H2Connection +from h2.errors import ErrorCodes 
+from h2.events import ( + ConnectionTerminated, + DataReceived, + RemoteSettingsChanged, + RequestReceived, + StreamEnded, + StreamReset, + WindowUpdated, +) +from h2.exceptions import ProtocolError, StreamClosedError +from h2.settings import SettingCodes + +RequestData = collections.namedtuple("RequestData", ["headers", "data"]) + + +class H2Server: + def __init__(self, host, port) -> None: + self.host = host + self.port = port + + +class H2Protocol(asyncio.Protocol): + def __init__(self): + config = H2Configuration(client_side=False, header_encoding="utf-8") + self.conn = H2Connection(config=config) + self.transport = None + self.stream_data = {} + self.flow_control_futures = {} + + def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + self.transport = transport + self.conn.initiate_connection() + self.transport.write(self.conn.data_to_send()) + + def connection_lost(self, _exc): + for future in self.flow_control_futures.values(): + future.cancel() + self.flow_control_futures = {} + + def data_received(self, data: bytes): + assert self.transport is not None + try: + events = self.conn.receive_data(data) + except ProtocolError: + self.transport.write(self.conn.data_to_send()) + self.transport.close() + else: + self.transport.write(self.conn.data_to_send()) + for event in events: + if isinstance(event, RequestReceived): + self.request_received(event.headers, event.stream_id) + elif isinstance(event, DataReceived): + self.receive_data(event.data, event.stream_id) + elif isinstance(event, StreamEnded): + self.stream_complete(event.stream_id) + elif isinstance(event, ConnectionTerminated): + self.transport.close() + elif isinstance(event, StreamReset): + self.stream_reset(event.stream_id) + elif isinstance(event, WindowUpdated): + self.window_updated(event.stream_id, event.delta) + elif isinstance(event, RemoteSettingsChanged): + if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: + self.window_updated(None, 0) + + self.transport.write(self.conn.data_to_send()) + + def request_received(self, headers: list[tuple[str, str]], stream_id: int): + headers_map = collections.OrderedDict(headers) + + # Store off the request data. + request_data = RequestData(headers_map, io.BytesIO()) + self.stream_data[stream_id] = request_data + + def stream_complete(self, stream_id: int): + """ + When a stream is complete, we can send our response. + """ + try: + request_data = self.stream_data[stream_id] + except KeyError: + # Just return, we probably 405'd this already + return + + headers = request_data.headers + body = request_data.data.getvalue().decode("utf-8") + + data = json.dumps({"headers": headers, "body": body}, indent=4).encode("utf8") + + response_headers = ( + (":status", "200"), + ("content-type", "application/json"), + ("content-length", str(len(data))), + ) + self.conn.send_headers(stream_id, response_headers) + asyncio.ensure_future(self.send_data(data, stream_id)) + + def receive_data(self, data: bytes, stream_id: int): + """ + We've received some data on a stream. If that stream is one we're + expecting data on, save it off. Otherwise, reset the stream. + """ + try: + stream_data = self.stream_data[stream_id] + except KeyError: + self.conn.reset_stream(stream_id, error_code=ErrorCodes.PROTOCOL_ERROR) + else: + stream_data.data.write(data) + + def stream_reset(self, stream_id): + """ + A stream reset was sent. Stop sending data. 
+ """ + if stream_id in self.flow_control_futures: + future = self.flow_control_futures.pop(stream_id) + future.cancel() + + async def send_data(self, data, stream_id): + """ + Send data according to the flow control rules. + """ + while data: + while self.conn.local_flow_control_window(stream_id) < 1: + try: + await self.wait_for_flow_control(stream_id) + except asyncio.CancelledError: + return + + chunk_size = min( + self.conn.local_flow_control_window(stream_id), + len(data), + self.conn.max_outbound_frame_size, + ) + + try: + self.conn.send_data( + stream_id, data[:chunk_size], end_stream=(chunk_size == len(data)) + ) + except (StreamClosedError, ProtocolError): + # The stream got closed and we didn't get told. We're done + # here. + break + + assert self.transport is not None + self.transport.write(self.conn.data_to_send()) + data = data[chunk_size:] + + async def wait_for_flow_control(self, stream_id): + """ + Waits for a Future that fires when the flow control window is opened. + """ + f: asyncio.Future[None] = asyncio.Future() + self.flow_control_futures[stream_id] = f + await f + + def window_updated(self, stream_id, delta): + """ + A window update frame was received. Unblock some number of flow control + Futures. + """ + if stream_id and stream_id in self.flow_control_futures: + f = self.flow_control_futures.pop(stream_id) + f.set_result(delta) + elif not stream_id: + for f in self.flow_control_futures.values(): + f.set_result(delta) + + self.flow_control_futures = {} + + +@pytest_asyncio.fixture(scope="function") +async def http2_echoserver() -> AsyncIterable[H2Server]: + loop = asyncio.get_event_loop() + serve = await loop.create_server(H2Protocol, "127.0.0.1", 0) + (host, port) = serve.sockets[0].getsockname() + + asyncio.create_task(serve.wait_closed()) + + server = H2Server(host, port) + yield server + + serve.close() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a8ec144fe9..1b9bc873f4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -35,11 +35,13 @@ import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from jwcrypto import jwk # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn +from pytest_httpserver import HTTPServer from urllib3.util.retry import Retry from fixtures import overlayfs @@ -53,6 +55,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.endpoint.http import EndpointHttpClient +from fixtures.h2server import H2Server from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -3080,6 +3083,31 @@ class PSQL: ) +def generate_proxy_tls_certs(common_name: str, key_path: Path, crt_path: Path): + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + f"/CN={common_name}", + "-addext", + f"subjectAltName = DNS:{common_name}", + ] + ) + assert r.returncode == 0 + + class NeonProxy(PgProtocol): link_auth_uri: str = "http://dummy-uri" @@ -3178,29 +3206,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / 
"proxy.key" - - if not key_path.exists(): - r = subprocess.run( - [ - "openssl", - "req", - "-new", - "-x509", - "-days", - "365", - "-nodes", - "-text", - "-out", - str(crt_path), - "-keyout", - str(key_path), - "-subj", - "/CN=*.localtest.me", - "-addext", - "subjectAltName = DNS:*.localtest.me", - ] - ) - assert r.returncode == 0 + generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3380,6 +3386,125 @@ class NeonProxy(PgProtocol): assert out == "ok" +class NeonAuthBroker: + class ControlPlane: + def __init__(self, endpoint: str): + self.endpoint = endpoint + + def extra_args(self) -> list[str]: + args = [ + *["--auth-backend", "console"], + *["--auth-endpoint", self.endpoint], + ] + return args + + def __init__( + self, + neon_binpath: Path, + test_output_dir: Path, + http_port: int, + mgmt_port: int, + external_http_port: int, + auth_backend: NeonAuthBroker.ControlPlane, + ): + self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.host = "127.0.0.1" + self.http_port = http_port + self.external_http_port = external_http_port + self.neon_binpath = neon_binpath + self.test_output_dir = test_output_dir + self.mgmt_port = mgmt_port + self.auth_backend = auth_backend + self.http_timeout_seconds = 15 + self._popen: Optional[subprocess.Popen[bytes]] = None + + def start(self) -> NeonAuthBroker: + assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + + args = [ + str(self.neon_binpath / "proxy"), + *["--http", f"{self.host}:{self.http_port}"], + *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], + *["--sql-over-http-pool-opt-in", "false"], + *["--is-auth-broker", "true"], + *self.auth_backend.extra_args(), + ] + + logfile = open(self.test_output_dir / "proxy.log", "w") + self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile) + self._wait_until_ready() + return self + + # Sends SIGTERM to the proxy if it has been started + def terminate(self): + if self._popen: + self._popen.terminate() + + # Waits for proxy to exit if it has been opened with a default timeout of + # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time. + def wait_for_exit(self, timeout=2): + if self._popen: + self._popen.wait(timeout=timeout) + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + assert ( + self._popen and self._popen.poll() is None + ), "Proxy exited unexpectedly. Check test log." 
+ requests.get(f"http://{self.host}:{self.http_port}/v1/status") + + async def query(self, query, args, **kwargs): + user = kwargs["user"] + token = kwargs["token"] + expected_code = kwargs.get("expected_code") + + log.info(f"Executing http query: {query}") + + connstr = f"postgresql://{user}@{self.domain}/postgres" + async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Neon-Connection-String": connstr, + "Authorization": f"Bearer {token}", + }, + ) + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + + def get_metrics(self) -> str: + request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") + return request_result.text + + def __enter__(self) -> NeonAuthBroker: + return self + + def __exit__( + self, + _exc_type: Optional[type[BaseException]], + _exc_value: Optional[BaseException], + _traceback: Optional[TracebackType], + ): + if self._popen is not None: + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warning("failed to gracefully terminate proxy; killing") + self._popen.kill() + + @pytest.fixture(scope="function") def link_proxy( port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path @@ -3444,6 +3569,74 @@ def static_proxy( yield proxy +@pytest.fixture(scope="function") +def neon_authorize_jwk() -> jwk.JWK: + kid = str(uuid.uuid4()) + key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid=kid) + assert isinstance(key, jwk.JWK) + return key + + +@pytest.fixture(scope="function") +def static_auth_broker( + port_distributor: PortDistributor, + neon_binpath: Path, + test_output_dir: Path, + httpserver: HTTPServer, + neon_authorize_jwk: jwk.JWK, + http2_echoserver: H2Server, +) -> Iterable[NeonAuthBroker]: + """Neon Auth Broker that routes to a mocked local_proxy and a mocked cplane HTTP API.""" + + local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}" + + # return local_proxy addr on ProxyWakeCompute. + httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json( + { + "address": local_proxy_addr, + "aux": { + "endpoint_id": "ep-foo-bar-1234", + "branch_id": "br-foo-bar", + "project_id": "foo-bar", + }, + } + ) + + # return jwks mock addr on GetEndpointJwks + httpserver.expect_request(re.compile("^/cplane/endpoints/.+/jwks$")).respond_with_json( + { + "jwks": [ + { + "id": "foo", + "jwks_url": httpserver.url_for("/authorize/jwks.json"), + "provider_name": "test", + "jwt_audience": None, + "role_names": ["anonymous", "authenticated"], + } + ] + } + ) + + # return static fixture jwks. 
+ jwk = neon_authorize_jwk.export_public(as_dict=True) + httpserver.expect_request("/authorize/jwks.json").respond_with_json({"keys": [jwk]}) + + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + + with NeonAuthBroker( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")), + ) as proxy: + proxy.start() + yield proxy + + class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" diff --git a/test_runner/regress/test_auth_broker.py b/test_runner/regress/test_auth_broker.py new file mode 100644 index 0000000000..11dc7d56b5 --- /dev/null +++ b/test_runner/regress/test_auth_broker.py @@ -0,0 +1,37 @@ +import json + +import pytest +from fixtures.neon_fixtures import NeonAuthBroker +from jwcrypto import jwk, jwt + + +@pytest.mark.asyncio +async def test_auth_broker_happy( + static_auth_broker: NeonAuthBroker, + neon_authorize_jwk: jwk.JWK, +): + """ + Signs a JWT and uses it to authorize a query to local_proxy. + """ + + token = jwt.JWT( + header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, claims={"sub": "user1"} + ) + token.make_signed_token(neon_authorize_jwk) + res = await static_auth_broker.query("foo", ["arg1"], user="anonymous", token=token.serialize()) + + # local proxy mock just echos back the request + # check that we forward the correct data + + assert ( + res["headers"]["authorization"] == f"Bearer {token.serialize()}" + ), "JWT should be forwarded" + + assert ( + "anonymous" in res["headers"]["neon-connection-string"] + ), "conn string should be forwarded" + + assert json.loads(res["body"]) == { + "query": "foo", + "params": ["arg1"], + }, "Query body should be forwarded" diff --git a/test_runner/stubs/h2/README.md b/test_runner/stubs/h2/README.md new file mode 100644 index 0000000000..cdf181ff80 --- /dev/null +++ b/test_runner/stubs/h2/README.md @@ -0,0 +1 @@ +generated via `poetry run stubgen -p h2 -o test_runner/stubs` diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi new file mode 100644 index 0000000000..710005db69 --- /dev/null +++ b/test_runner/stubs/h2/config.pyi @@ -0,0 +1,42 @@ +from _typeshed import Incomplete + +class _BooleanConfigOption: + name: Incomplete + attr_name: Incomplete + def __init__(self, name) -> None: ... + def __get__(self, instance, owner): ... + def __set__(self, instance, value) -> None: ... + +class DummyLogger: + def __init__(self, *vargs) -> None: ... + def debug(self, *vargs, **kwargs) -> None: ... + def trace(self, *vargs, **kwargs) -> None: ... + +class OutputLogger: + file: Incomplete + trace_level: Incomplete + def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def debug(self, fmtstr, *args) -> None: ... + def trace(self, fmtstr, *args) -> None: ... 
+ +class H2Configuration: + client_side: Incomplete + validate_outbound_headers: Incomplete + normalize_outbound_headers: Incomplete + validate_inbound_headers: Incomplete + normalize_inbound_headers: Incomplete + logger: Incomplete + def __init__( + self, + client_side: bool = ..., + header_encoding: Incomplete | None = ..., + validate_outbound_headers: bool = ..., + normalize_outbound_headers: bool = ..., + validate_inbound_headers: bool = ..., + normalize_inbound_headers: bool = ..., + logger: Incomplete | None = ..., + ) -> None: ... + @property + def header_encoding(self): ... + @header_encoding.setter + def header_encoding(self, value) -> None: ... diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi new file mode 100644 index 0000000000..04be18ca74 --- /dev/null +++ b/test_runner/stubs/h2/connection.pyi @@ -0,0 +1,142 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .config import H2Configuration as H2Configuration +from .errors import ErrorCodes as ErrorCodes +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable +from .events import ConnectionTerminated as ConnectionTerminated +from .events import PingAckReceived as PingAckReceived +from .events import PingReceived as PingReceived +from .events import PriorityUpdated as PriorityUpdated +from .events import RemoteSettingsChanged as RemoteSettingsChanged +from .events import SettingsAcknowledged as SettingsAcknowledged +from .events import UnknownFrameReceived as UnknownFrameReceived +from .events import WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError +from .exceptions import FlowControlError as FlowControlError +from .exceptions import FrameTooLargeError as FrameTooLargeError +from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError +from .exceptions import NoSuchStreamError as NoSuchStreamError +from .exceptions import ProtocolError as ProtocolError +from .exceptions import RFC1122Error as RFC1122Error +from .exceptions import StreamClosedError as StreamClosedError +from .exceptions import StreamIDTooLowError as StreamIDTooLowError +from .exceptions import TooManyStreamsError as TooManyStreamsError +from .frame_buffer import FrameBuffer as FrameBuffer +from .settings import SettingCodes as SettingCodes +from .settings import Settings as Settings +from .stream import H2Stream as H2Stream +from .stream import StreamClosedBy as StreamClosedBy +from .utilities import guard_increment_window as guard_increment_window +from .windows import WindowManager as WindowManager + +class ConnectionState(Enum): + IDLE: int + CLIENT_OPEN: int + SERVER_OPEN: int + CLOSED: int + +class ConnectionInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_DATA: int + SEND_GOAWAY: int + SEND_WINDOW_UPDATE: int + SEND_PING: int + SEND_SETTINGS: int + SEND_RST_STREAM: int + SEND_PRIORITY: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_DATA: int + RECV_GOAWAY: int + RECV_WINDOW_UPDATE: int + RECV_PING: int + RECV_SETTINGS: int + RECV_RST_STREAM: int + RECV_PRIORITY: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + +class AllowedStreamIDs(IntEnum): + EVEN: int + ODD: int + +class H2ConnectionStateMachine: + state: Incomplete + def __init__(self) -> None: ... + def process_input(self, input_): ... 
+ +class H2Connection: + DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int + DEFAULT_MAX_INBOUND_FRAME_SIZE: Incomplete + HIGHEST_ALLOWED_STREAM_ID: Incomplete + MAX_WINDOW_INCREMENT: Incomplete + DEFAULT_MAX_HEADER_LIST_SIZE: Incomplete + MAX_CLOSED_STREAMS: Incomplete + state_machine: Incomplete + streams: Incomplete + highest_inbound_stream_id: int + highest_outbound_stream_id: int + encoder: Incomplete + decoder: Incomplete + config: Incomplete + local_settings: Incomplete + remote_settings: Incomplete + outbound_flow_control_window: Incomplete + max_outbound_frame_size: Incomplete + max_inbound_frame_size: Incomplete + incoming_buffer: Incomplete + def __init__(self, config: Incomplete | None = ...) -> None: ... + @property + def open_outbound_streams(self): ... + @property + def open_inbound_streams(self): ... + @property + def inbound_flow_control_window(self): ... + def initiate_connection(self) -> None: ... + def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... + def get_next_available_stream_id(self): ... + def send_headers( + self, + stream_id, + headers, + end_stream: bool = ..., + priority_weight: Incomplete | None = ..., + priority_depends_on: Incomplete | None = ..., + priority_exclusive: Incomplete | None = ..., + ) -> None: ... + def send_data( + self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... + ) -> None: ... + def end_stream(self, stream_id) -> None: ... + def increment_flow_control_window( + self, increment, stream_id: Incomplete | None = ... + ) -> None: ... + def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... + def ping(self, opaque_data) -> None: ... + def reset_stream(self, stream_id, error_code: int = ...) -> None: ... + def close_connection( + self, + error_code: int = ..., + additional_data: Incomplete | None = ..., + last_stream_id: Incomplete | None = ..., + ) -> None: ... + def update_settings(self, new_settings) -> None: ... + def advertise_alternative_service( + self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... + ) -> None: ... + def prioritize( + self, + stream_id, + weight: Incomplete | None = ..., + depends_on: Incomplete | None = ..., + exclusive: Incomplete | None = ..., + ) -> None: ... + def local_flow_control_window(self, stream_id): ... + def remote_flow_control_window(self, stream_id): ... + def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... + def data_to_send(self, amount: Incomplete | None = ...): ... + def clear_outbound_data_buffer(self) -> None: ... + def receive_data(self, data): ... diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi new file mode 100644 index 0000000000..b70c632f8c --- /dev/null +++ b/test_runner/stubs/h2/errors.pyi @@ -0,0 +1,17 @@ +import enum + +class ErrorCodes(enum.IntEnum): + NO_ERROR: int + PROTOCOL_ERROR: int + INTERNAL_ERROR: int + FLOW_CONTROL_ERROR: int + SETTINGS_TIMEOUT: int + STREAM_CLOSED: int + FRAME_SIZE_ERROR: int + REFUSED_STREAM: int + CANCEL: int + COMPRESSION_ERROR: int + CONNECT_ERROR: int + ENHANCE_YOUR_CALM: int + INADEQUATE_SECURITY: int + HTTP_1_1_REQUIRED: int diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi new file mode 100644 index 0000000000..75d0a9e53b --- /dev/null +++ b/test_runner/stubs/h2/events.pyi @@ -0,0 +1,106 @@ +from _typeshed import Incomplete + +from .settings import ChangedSetting as ChangedSetting + +class Event: ... 
+ +class RequestReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class ResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class TrailersReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class _HeadersSent(Event): ... +class _ResponseSent(_HeadersSent): ... +class _RequestSent(_HeadersSent): ... +class _TrailersSent(_HeadersSent): ... +class _PushedRequestSent(_HeadersSent): ... + +class InformationalResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class DataReceived(Event): + stream_id: Incomplete + data: Incomplete + flow_controlled_length: Incomplete + stream_ended: Incomplete + def __init__(self) -> None: ... + +class WindowUpdated(Event): + stream_id: Incomplete + delta: Incomplete + def __init__(self) -> None: ... + +class RemoteSettingsChanged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + @classmethod + def from_settings(cls, old_settings, new_settings): ... + +class PingReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class PingAckReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class StreamEnded(Event): + stream_id: Incomplete + def __init__(self) -> None: ... + +class StreamReset(Event): + stream_id: Incomplete + error_code: Incomplete + remote_reset: bool + def __init__(self) -> None: ... + +class PushedStreamReceived(Event): + pushed_stream_id: Incomplete + parent_stream_id: Incomplete + headers: Incomplete + def __init__(self) -> None: ... + +class SettingsAcknowledged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + +class PriorityUpdated(Event): + stream_id: Incomplete + weight: Incomplete + depends_on: Incomplete + exclusive: Incomplete + def __init__(self) -> None: ... + +class ConnectionTerminated(Event): + error_code: Incomplete + last_stream_id: Incomplete + additional_data: Incomplete + def __init__(self) -> None: ... + +class AlternativeServiceAvailable(Event): + origin: Incomplete + field_value: Incomplete + def __init__(self) -> None: ... + +class UnknownFrameReceived(Event): + frame: Incomplete + def __init__(self) -> None: ... diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi new file mode 100644 index 0000000000..82019d5ec1 --- /dev/null +++ b/test_runner/stubs/h2/exceptions.pyi @@ -0,0 +1,48 @@ +from _typeshed import Incomplete + +class H2Error(Exception): ... + +class ProtocolError(H2Error): + error_code: Incomplete + +class FrameTooLargeError(ProtocolError): + error_code: Incomplete + +class FrameDataMissingError(ProtocolError): + error_code: Incomplete + +class TooManyStreamsError(ProtocolError): ... + +class FlowControlError(ProtocolError): + error_code: Incomplete + +class StreamIDTooLowError(ProtocolError): + stream_id: Incomplete + max_stream_id: Incomplete + def __init__(self, stream_id, max_stream_id) -> None: ... + +class NoAvailableStreamIDError(ProtocolError): ... + +class NoSuchStreamError(ProtocolError): + stream_id: Incomplete + def __init__(self, stream_id) -> None: ... 
+ +class StreamClosedError(NoSuchStreamError): + stream_id: Incomplete + error_code: Incomplete + def __init__(self, stream_id) -> None: ... + +class InvalidSettingsValueError(ProtocolError, ValueError): + error_code: Incomplete + def __init__(self, msg, error_code) -> None: ... + +class InvalidBodyLengthError(ProtocolError): + expected_length: Incomplete + actual_length: Incomplete + def __init__(self, expected, actual) -> None: ... + +class UnsupportedFrameError(ProtocolError): ... +class RFC1122Error(H2Error): ... + +class DenialOfServiceError(ProtocolError): + error_code: Incomplete diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi new file mode 100644 index 0000000000..f47adab704 --- /dev/null +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -0,0 +1,19 @@ +from .exceptions import ( + FrameDataMissingError as FrameDataMissingError, +) +from .exceptions import ( + FrameTooLargeError as FrameTooLargeError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) + +CONTINUATION_BACKLOG: int + +class FrameBuffer: + data: bytes + max_frame_size: int + def __init__(self, server: bool = ...) -> None: ... + def add_data(self, data) -> None: ... + def __iter__(self): ... + def __next__(self): ... diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi new file mode 100644 index 0000000000..a352abe53e --- /dev/null +++ b/test_runner/stubs/h2/settings.pyi @@ -0,0 +1,61 @@ +import enum +from collections.abc import MutableMapping +from typing import Any + +from _typeshed import Incomplete +from h2.errors import ErrorCodes as ErrorCodes +from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError + +class SettingCodes(enum.IntEnum): + HEADER_TABLE_SIZE: Incomplete + ENABLE_PUSH: Incomplete + MAX_CONCURRENT_STREAMS: Incomplete + INITIAL_WINDOW_SIZE: Incomplete + MAX_FRAME_SIZE: Incomplete + MAX_HEADER_LIST_SIZE: Incomplete + ENABLE_CONNECT_PROTOCOL: Incomplete + +class ChangedSetting: + setting: Incomplete + original_value: Incomplete + new_value: Incomplete + def __init__(self, setting, original_value, new_value) -> None: ... + +class Settings(MutableMapping[str, Any]): + def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... + def acknowledge(self): ... + @property + def header_table_size(self): ... + @header_table_size.setter + def header_table_size(self, value) -> None: ... + @property + def enable_push(self): ... + @enable_push.setter + def enable_push(self, value) -> None: ... + @property + def initial_window_size(self): ... + @initial_window_size.setter + def initial_window_size(self, value) -> None: ... + @property + def max_frame_size(self): ... + @max_frame_size.setter + def max_frame_size(self, value) -> None: ... + @property + def max_concurrent_streams(self): ... + @max_concurrent_streams.setter + def max_concurrent_streams(self, value) -> None: ... + @property + def max_header_list_size(self): ... + @max_header_list_size.setter + def max_header_list_size(self, value) -> None: ... + @property + def enable_connect_protocol(self): ... + @enable_connect_protocol.setter + def enable_connect_protocol(self, value) -> None: ... + def __getitem__(self, key): ... + def __setitem__(self, key, value) -> None: ... + def __delitem__(self, key) -> None: ... + def __iter__(self): ... + def __len__(self) -> int: ... + def __eq__(self, other): ... + def __ne__(self, other): ... 
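
The `.pyi` stubs in this commit only cover the slice of the `h2` package that the test suite type-checks against. They exist because the `http2_echoserver` fixture used by `static_auth_broker` (defined outside this excerpt) is built on that API: it stands in for `local_proxy`, since the mocked `/cplane/proxy_wake_compute` endpoint returns the echo server's address, so the broker forwards queries to it over HTTP/2 and the test can inspect what was forwarded. The following is a hedged sketch of such an echo server, not the fixture itself; `handle_client`, the listen address, and the exact response shape are assumptions, though the echoed `{"headers": ..., "body": ...}` form matches what `test_auth_broker_happy` asserts on.

```python
# A minimal sketch, assuming an asyncio transport; not taken from the real fixture.
import asyncio
import json

from h2.config import H2Configuration
from h2.connection import H2Connection
from h2.events import DataReceived, RequestReceived, StreamEnded


async def handle_client(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
    conn = H2Connection(config=H2Configuration(client_side=False))
    conn.initiate_connection()
    writer.write(conn.data_to_send())
    pending: dict[int, dict] = {}

    while not reader.at_eof():
        data = await reader.read(65536)
        if not data:
            break
        for event in conn.receive_data(data):
            if isinstance(event, RequestReceived):
                # With the default H2Configuration, header names/values arrive as bytes.
                headers = {k.decode(): v.decode() for k, v in event.headers}
                pending[event.stream_id] = {"headers": headers, "body": b""}
            elif isinstance(event, DataReceived):
                pending[event.stream_id]["body"] += event.data
                conn.acknowledge_received_data(
                    event.flow_controlled_length, event.stream_id
                )
            elif isinstance(event, StreamEnded):
                req = pending.pop(event.stream_id)
                # Echo the request back as JSON; the test reads "headers" and "body".
                body = json.dumps(
                    {"headers": req["headers"], "body": req["body"].decode()}
                ).encode()
                conn.send_headers(
                    event.stream_id,
                    [(":status", "200"), ("content-type", "application/json")],
                )
                conn.send_data(event.stream_id, body, end_stream=True)
        writer.write(conn.data_to_send())
        await writer.drain()


async def main() -> None:
    # The port here is illustrative; the real fixture gets one from PortDistributor.
    server = await asyncio.start_server(handle_client, "127.0.0.1", 8181)
    async with server:
        await server.serve_forever()


if __name__ == "__main__":
    asyncio.run(main())
```

A real server would additionally need to agree on HTTP/2 with the broker (via ALPN over TLS or prior knowledge over cleartext), which is omitted here.
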
diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi new file mode 100644 index 0000000000..d52ab8e72b --- /dev/null +++ b/test_runner/stubs/h2/stream.pyi @@ -0,0 +1,184 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .errors import ErrorCodes as ErrorCodes +from .events import ( + AlternativeServiceAvailable as AlternativeServiceAvailable, +) +from .events import ( + DataReceived as DataReceived, +) +from .events import ( + InformationalResponseReceived as InformationalResponseReceived, +) +from .events import ( + PushedStreamReceived as PushedStreamReceived, +) +from .events import ( + RequestReceived as RequestReceived, +) +from .events import ( + ResponseReceived as ResponseReceived, +) +from .events import ( + StreamEnded as StreamEnded, +) +from .events import ( + StreamReset as StreamReset, +) +from .events import ( + TrailersReceived as TrailersReceived, +) +from .events import ( + WindowUpdated as WindowUpdated, +) +from .exceptions import ( + FlowControlError as FlowControlError, +) +from .exceptions import ( + InvalidBodyLengthError as InvalidBodyLengthError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) +from .exceptions import ( + StreamClosedError as StreamClosedError, +) +from .utilities import ( + HeaderValidationFlags as HeaderValidationFlags, +) +from .utilities import ( + authority_from_headers as authority_from_headers, +) +from .utilities import ( + extract_method_header as extract_method_header, +) +from .utilities import ( + guard_increment_window as guard_increment_window, +) +from .utilities import ( + is_informational_response as is_informational_response, +) +from .utilities import ( + normalize_inbound_headers as normalize_inbound_headers, +) +from .utilities import ( + normalize_outbound_headers as normalize_outbound_headers, +) +from .utilities import ( + validate_headers as validate_headers, +) +from .utilities import ( + validate_outbound_headers as validate_outbound_headers, +) +from .windows import WindowManager as WindowManager + +class StreamState(IntEnum): + IDLE: int + RESERVED_REMOTE: int + RESERVED_LOCAL: int + OPEN: int + HALF_CLOSED_REMOTE: int + HALF_CLOSED_LOCAL: int + CLOSED: int + +class StreamInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_RST_STREAM: int + SEND_DATA: int + SEND_WINDOW_UPDATE: int + SEND_END_STREAM: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_RST_STREAM: int + RECV_DATA: int + RECV_WINDOW_UPDATE: int + RECV_END_STREAM: int + RECV_CONTINUATION: int + SEND_INFORMATIONAL_HEADERS: int + RECV_INFORMATIONAL_HEADERS: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + UPGRADE_CLIENT: int + UPGRADE_SERVER: int + +class StreamClosedBy(Enum): + SEND_END_STREAM: int + RECV_END_STREAM: int + SEND_RST_STREAM: int + RECV_RST_STREAM: int + +STREAM_OPEN: Incomplete + +class H2StreamStateMachine: + state: Incomplete + stream_id: Incomplete + client: Incomplete + headers_sent: Incomplete + trailers_sent: Incomplete + headers_received: Incomplete + trailers_received: Incomplete + stream_closed_by: Incomplete + def __init__(self, stream_id) -> None: ... + def process_input(self, input_): ... + def request_sent(self, previous_state): ... + def response_sent(self, previous_state): ... + def request_received(self, previous_state): ... + def response_received(self, previous_state): ... + def data_received(self, previous_state): ... + def window_updated(self, previous_state): ... 
+ def stream_half_closed(self, previous_state): ... + def stream_ended(self, previous_state): ... + def stream_reset(self, previous_state): ... + def send_new_pushed_stream(self, previous_state): ... + def recv_new_pushed_stream(self, previous_state): ... + def send_push_promise(self, previous_state): ... + def recv_push_promise(self, previous_state): ... + def send_end_stream(self, previous_state) -> None: ... + def send_reset_stream(self, previous_state) -> None: ... + def reset_stream_on_error(self, previous_state) -> None: ... + def recv_on_closed_stream(self, previous_state) -> None: ... + def send_on_closed_stream(self, previous_state) -> None: ... + def recv_push_on_closed_stream(self, previous_state) -> None: ... + def send_push_on_closed_stream(self, previous_state) -> None: ... + def send_informational_response(self, previous_state): ... + def recv_informational_response(self, previous_state): ... + def recv_alt_svc(self, previous_state): ... + def send_alt_svc(self, previous_state) -> None: ... + +class H2Stream: + state_machine: Incomplete + stream_id: Incomplete + max_outbound_frame_size: Incomplete + request_method: Incomplete + outbound_flow_control_window: Incomplete + config: Incomplete + def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + @property + def inbound_flow_control_window(self): ... + @property + def open(self): ... + @property + def closed(self): ... + @property + def closed_by(self): ... + def upgrade(self, client_side) -> None: ... + def send_headers(self, headers, encoder, end_stream: bool = ...): ... + def push_stream_in_band(self, related_stream_id, headers, encoder): ... + def locally_pushed(self): ... + def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... + def end_stream(self): ... + def advertise_alternative_service(self, field_value): ... + def increase_flow_control_window(self, increment): ... + def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... + def remotely_pushed(self, pushed_headers): ... + def receive_headers(self, headers, end_stream, header_encoding): ... + def receive_data(self, data, end_stream, flow_control_len): ... + def receive_window_update(self, increment): ... + def receive_continuation(self) -> None: ... + def receive_alt_svc(self, frame): ... + def reset_stream(self, error_code: int = ...): ... + def stream_reset(self, frame): ... + def acknowledge_received_data(self, acknowledged_size): ... diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi new file mode 100644 index 0000000000..e0a8d55d1d --- /dev/null +++ b/test_runner/stubs/h2/utilities.pyi @@ -0,0 +1,25 @@ +from typing import NamedTuple + +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError +from .exceptions import ProtocolError as ProtocolError + +UPPER_RE: Incomplete +CONNECTION_HEADERS: Incomplete + +def extract_method_header(headers): ... +def is_informational_response(headers): ... +def guard_increment_window(current, increment): ... +def authority_from_headers(headers): ... + +class HeaderValidationFlags(NamedTuple): + is_client: Incomplete + is_trailer: Incomplete + is_response_header: Incomplete + is_push_promise: Incomplete + +def validate_headers(headers, hdr_validation_flags): ... +def normalize_outbound_headers(headers, hdr_validation_flags): ... +def normalize_inbound_headers(headers, hdr_validation_flags): ... 
+def validate_outbound_headers(headers, hdr_validation_flags): ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi new file mode 100644 index 0000000000..7dc78e431c --- /dev/null +++ b/test_runner/stubs/h2/windows.pyi @@ -0,0 +1,13 @@ +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError + +LARGEST_FLOW_CONTROL_WINDOW: Incomplete + +class WindowManager: + max_window_size: Incomplete + current_window_size: Incomplete + def __init__(self, max_window_size) -> None: ... + def window_consumed(self, size) -> None: ... + def window_opened(self, size) -> None: ... + def process_bytes(self, size): ... From 4ef74215e1174186c7ab8cdb41d98cb9a327d07d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 13:00:03 +0000 Subject: [PATCH 23/27] pageserver: refactor generation-aware loading code into generic (#9545) ## Problem Indices used to be the only kind of object where we had to search across generations to find the most recent one. As of https://github.com/neondatabase/neon/issues/9543, manifests will need the same treatment. ## Summary of changes - Refactor download_index_part to a generic download_generation_object function, which will be usable for downloading manifest objects as well. --- .../tenant/remote_timeline_client/download.rs | 139 ++++++++++++------ 1 file changed, 91 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 95f8f026d4..8679c68a27 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -403,59 +403,79 @@ async fn do_download_index_part( Ok((index_part, index_generation, index_part_mtime)) } -/// index_part.json objects are suffixed with a generation number, so we cannot -/// directly GET the latest index part without doing some probing. +/// Metadata objects are "generationed", meaning that they include a generation suffix. This +/// function downloads the object with the highest generation <= `my_generation`. /// -/// In this function we probe for the most recent index in a generation <= our current generation. -/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +/// Data objects (layer files) also include a generation in their path, but there is no equivalent +/// search process, because their reference from an index includes the generation. +/// +/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two +/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding +/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back +/// to listing objects. +/// +/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]` +/// * `what`: for logging, what object are we downloading +/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation) +/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless +/// `cancel`` has fired. This function does not do its own retries of GET operations, and relies +/// on the function passed in to do so. +/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object. 
+#[allow(clippy::too_many_arguments)] #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(crate) async fn download_index_part( - storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, +pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>( + storage: &'a GenericRemoteStorage, + tenant_shard_id: &'a TenantShardId, + timeline_id: &'a TimelineId, my_generation: Generation, - cancel: &CancellationToken, -) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + what: &str, + prefix: RemotePath, + do_download: DF, + parse_path: PF, + cancel: &'a CancellationToken, +) -> Result<(T, Generation, SystemTime), DownloadError> +where + DF: Fn( + &'a GenericRemoteStorage, + &'a TenantShardId, + &'a TimelineId, + Generation, + &'a CancellationToken, + ) -> DFF, + DFF: Future>, + PF: Fn(RemotePath) -> Option, + T: 'static, +{ debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { // Operating without generations: just fetch the generation-less path - return do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel, - ) - .await; + return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; } - // Stale case: If we were intentionally attached in a stale generation, there may already be a remote - // index in our generation. + // Stale case: If we were intentionally attached in a stale generation, the remote object may already + // exist in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = - do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; + let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { - Ok(index_part) => { - tracing::debug!( - "Found index_part from current generation (this is a stale attachment)" - ); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from current generation (this is a stale attachment)"); + return Ok(decoded); } Err(DownloadError::NotFound) => {} Err(e) => return Err(e), }; - // Typical case: the previous generation of this tenant was running healthily, and had uploaded - // and index part. We may safely start from this index without doing a listing, because: + // Typical case: the previous generation of this tenant was running healthily, and had uploaded the object + // we are seeking in that generation. We may safely start from this index without doing a listing, because: // - We checked for current generation case above // - generations > my_generation are to be ignored - // - any other indices that exist would have an older generation than `previous_gen`, and - // we want to find the most recent index from a previous generation. + // - any other objects that exist would have an older generation than `previous_gen`, and + // we want to find the most recent object from a previous generation. // // This is an optimization to avoid doing the listing for the general case below. 
- let res = do_download_index_part( + let res = do_download( storage, tenant_shard_id, timeline_id, @@ -464,14 +484,12 @@ pub(crate) async fn download_index_part( ) .await; match res { - Ok(index_part) => { - tracing::debug!("Found index_part from previous generation"); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from previous generation"); + return Ok(decoded); } Err(DownloadError::NotFound) => { - tracing::debug!( - "No index_part found from previous generation, falling back to listing" - ); + tracing::debug!("No {what} found from previous generation, falling back to listing"); } Err(e) => { return Err(e); @@ -481,12 +499,10 @@ pub(crate) async fn download_index_part( // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. - let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - - let indices = download_retry( + let paths = download_retry( || async { storage - .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .list(Some(&prefix), ListingMode::NoDelimiter, None, cancel) .await }, "list index_part files", @@ -497,22 +513,22 @@ pub(crate) async fn download_index_part( // General case logic for which index to use: the latest index whose generation // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md - let max_previous_generation = indices + let max_previous_generation = paths .into_iter() - .filter_map(|o| parse_remote_index_path(o.key)) + .filter_map(|o| parse_path(o.key)) .filter(|g| g <= &my_generation) .max(); match max_previous_generation { Some(g) => { - tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await + tracing::debug!("Found {what} in generation {g:?}"); + do_download(storage, tenant_shard_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. - tracing::debug!("No index_part.json* found"); - do_download_index_part( + tracing::debug!("No {what}* found"); + do_download( storage, tenant_shard_id, timeline_id, @@ -524,6 +540,33 @@ pub(crate) async fn download_index_part( } } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. +/// +/// In this function we probe for the most recent index in a generation <= our current generation. 
+/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +pub(crate) async fn download_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); + download_generation_object( + storage, + tenant_shard_id, + timeline_id, + my_generation, + "index_part", + index_prefix, + do_download_index_part, + parse_remote_index_path, + cancel, + ) + .await +} + pub(crate) async fn download_initdb_tar_zst( conf: &'static PageServerConf, storage: &GenericRemoteStorage, From 7a1331eee56a1590ef4fb73f07e70c013c7d9c84 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 13:54:48 +0000 Subject: [PATCH 24/27] pageserver: make concurrent offloaded timeline operations safe wrt manifest uploads (#9557) ## Problem Uploads of the tenant manifest could race between different tasks, resulting in unexpected results in remote storage. Closes: https://github.com/neondatabase/neon/issues/9556 ## Summary of changes - Create a central function for uploads that takes a tokio::sync::Mutex - Store the latest upload in that Mutex, so that when there is lots of concurrency (e.g. archive 20 timelines at once) we can coalesce their manifest writes somewhat. --- pageserver/src/tenant.rs | 100 +++++++++++++----- .../src/tenant/remote_timeline_client.rs | 2 +- .../tenant/remote_timeline_client/manifest.rs | 4 +- pageserver/src/tenant/timeline/delete.rs | 43 +++----- pageserver/src/tenant/timeline/offload.rs | 17 +-- 5 files changed, 94 insertions(+), 72 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7f8af67c2c..64e4eb46ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,6 +302,13 @@ pub struct Tenant { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations + /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for + /// each other (this could be optimized to coalesce writes if necessary). + /// + /// The contents of the Mutex are the last manifest we successfully uploaded + tenant_manifest_upload: tokio::sync::Mutex>, + // This mutex prevents creation of new timelines during GC. 
// Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -741,6 +748,24 @@ pub enum TimelineArchivalError { Other(anyhow::Error), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TenantManifestError { + #[error("Remote storage error: {0}")] + RemoteStorage(anyhow::Error), + + #[error("Cancelled")] + Cancelled, +} + +impl From for TimelineArchivalError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::RemoteStorage(e) => Self::Other(e), + TenantManifestError::Cancelled => Self::Cancelled, + } + } +} + impl Debug for TimelineArchivalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -1526,18 +1551,7 @@ impl Tenant { offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } if !offloaded_timeline_ids.is_empty() { - let manifest = self.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - generation, - &manifest, - &self.cancel, - ) - .await - .map_err(TimelineArchivalError::Other)?; + self.store_tenant_manifest().await?; } // The local filesystem contents are a cache of what's in the remote IndexPart; @@ -1918,18 +1932,7 @@ impl Tenant { }; // Upload new list of offloaded timelines to S3 - let manifest = self.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - generation, - &manifest, - &cancel, - ) - .await - .map_err(TimelineArchivalError::Other)?; + self.store_tenant_manifest().await?; // Activate the timeline (if it makes sense) if !(timeline.is_broken() || timeline.is_stopping()) { @@ -3126,7 +3129,7 @@ impl Tenant { } } - let tenant_manifest = self.tenant_manifest(); + let tenant_manifest = self.build_tenant_manifest(); // TODO: generation support let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; for child_shard in child_shards { @@ -3321,7 +3324,8 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } - pub(crate) fn tenant_manifest(&self) -> TenantManifest { + /// Generate an up-to-date TenantManifest based on the state of this Tenant. + fn build_tenant_manifest(&self) -> TenantManifest { let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); let mut timeline_manifests = timelines_offloaded @@ -3529,6 +3533,7 @@ impl Tenant { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + tenant_manifest_upload: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, @@ -4708,6 +4713,49 @@ impl Tenant { .max() .unwrap_or(0) } + + /// Serialize and write the latest TenantManifest to remote storage. + pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> { + // Only one manifest write may be done at at time, and the contents of the manifest + // must be loaded while holding this lock. This makes it safe to call this function + // from anywhere without worrying about colliding updates. + let mut guard = tokio::select! 
{ + g = self.tenant_manifest_upload.lock() => { + g + }, + _ = self.cancel.cancelled() => { + return Err(TenantManifestError::Cancelled); + } + }; + + let manifest = self.build_tenant_manifest(); + if Some(&manifest) == (*guard).as_ref() { + // Optimisation: skip uploads that don't change anything. + return Ok(()); + } + + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + .map_err(|e| { + if self.cancel.is_cancelled() { + TenantManifestError::Cancelled + } else { + TenantManifestError::RemoteStorage(e) + } + })?; + + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1c72c7fff8..19e762b9fa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -249,7 +249,7 @@ pub(crate) use download::{ list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest}; +pub(crate) use upload::upload_initdb_dir; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 7d92d45146..c4382cb648 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use utils::{id::TimelineId, lsn::Lsn}; /// Tenant-shard scoped manifest -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TenantManifest { /// Debugging aid describing the version of this manifest. /// Can also be used for distinguishing breaking changes later on. @@ -23,7 +23,7 @@ pub struct TenantManifest { /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], /// but the two datastructures serve different needs, this is for a persistent disk format /// that must be backwards compatible, while the other is only for informative purposes. 
-#[derive(Clone, Serialize, Deserialize, Copy)] +#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)] pub struct OffloadedTimelineManifest { pub timeline_id: TimelineId, /// Whether the timeline has a parent it has been branched off from or not diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 53b65da515..2c6161da15 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,10 +14,9 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, - remote_timeline_client::{ - self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, - CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, + TimelineOrOffloaded, }, }; @@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant( Ok(()) } -/// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn upload_new_tenant_manifest( - tenant: &Tenant, - _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { - // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash - // between the deletion of the index-part.json and reaching of this code. - // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. - // However, we handle this case in tenant loading code so the next time we attach, the issue is - // resolved. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; - - Ok(()) -} - /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -480,7 +453,15 @@ impl DeleteTimelineFlow { remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; - upload_new_tenant_manifest(tenant, &guard).await?; + // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash + // between the deletion of the index-part.json and reaching of this code. + // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. + // However, we handle this case in tenant loading code so the next time we attach, the issue is + // resolved. 
+ tenant + .store_tenant_manifest() + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 8e6eceb084..305c139b54 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: &Tenant, @@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; + tenant + .store_tenant_manifest() + .await + .map_err(|e| anyhow::anyhow!(e))?; Ok(()) } From 793ad50b7d54c2c45c19e362b7bd9894a389d2cb Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 29 Oct 2024 14:25:23 +0000 Subject: [PATCH 25/27] fix allow_unstable_extensions GUC - make it USERSET (#9563) fix message wording --- pgxn/neon/unstable_extensions.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/unstable_extensions.c b/pgxn/neon/unstable_extensions.c index a3445cb268..72de2871f4 100644 --- a/pgxn/neon/unstable_extensions.c +++ b/pgxn/neon/unstable_extensions.c @@ -65,8 +65,8 @@ CheckUnstableExtension( { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("installing %s is currently prohibited", stmt->extname), - errhint("Set neon.allow_unstable_extensions to true"))); + errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname), + errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'"))); } break; } @@ -110,13 +110,13 @@ InitUnstableExtensionsSupport(void) NULL, &allow_unstable_extensions, false, - PGC_SUSET, + PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomStringVariable( "neon.unstable_extensions", - "Allow unstable extensions to be installed and used", + "List of unstable extensions", NULL, &unstable_extensions, NULL, From 57499640c5ab677796f61a5cd813fd6c881998e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:44:45 +0100 Subject: [PATCH 26/27] proxy: more granular http status codes for sql-over-http errors (#9549) closes #9532 --- proxy/src/serverless/error.rs | 5 +++++ proxy/src/serverless/mod.rs | 1 + proxy/src/serverless/sql_over_http.rs | 24 +++++++++++++++++++++--- test_runner/regress/test_proxy.py | 2 +- 4 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 proxy/src/serverless/error.rs diff --git a/proxy/src/serverless/error.rs b/proxy/src/serverless/error.rs new file mode 100644 index 0000000000..323c91baa5 --- /dev/null +++ 
b/proxy/src/serverless/error.rs @@ -0,0 +1,5 @@ +use http::StatusCode; + +pub trait HttpCodeError { + fn get_http_status_code(&self) -> StatusCode; +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8fb7a771d9..edbb0347d3 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -6,6 +6,7 @@ mod backend; pub mod cancel_set; mod conn_pool; mod conn_pool_lib; +mod error; mod http_conn_pool; mod http_util; mod json; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 1f3eec6d19..0713c27d65 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -28,6 +28,7 @@ use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; +use super::error::HttpCodeError; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use super::local_conn_pool; @@ -238,7 +239,6 @@ fn get_conn_info( Ok(ConnInfoWithAuth { conn_info, auth }) } -// TODO: return different http error codes pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestMonitoring, @@ -319,9 +319,8 @@ pub(crate) async fn handle( "forwarding error to user" ); - // TODO: this shouldn't always be bad request. json_response( - StatusCode::BAD_REQUEST, + e.get_http_status_code(), json!({ "message": message, "code": code, @@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError { } } +impl HttpCodeError for SqlOverHttpError { + fn get_http_status_code(&self) -> StatusCode { + match self { + SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { + ErrorKind::User => StatusCode::BAD_REQUEST, + _ => StatusCode::INTERNAL_SERVER_ERROR, + }, + SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, + SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, + SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, + SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, + SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR, + } + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f598900af9..e59d46e352 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -561,7 +561,7 @@ def test_sql_over_http_pool_dos(static_proxy: NeonProxy): # query generates a million rows - should hit the 10MB reponse limit quickly response = query( - 400, + 507, "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;", ) assert "response is too large (max is 10485760 bytes)" in response["message"] From 80e163004242ebb048447053a3fa3c9d432dd085 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 29 Oct 2024 15:57:52 +0000 Subject: [PATCH 27/27] Use pg_mooncake from our fork. 
(#9565) Switch to main repo once https://github.com/Mooncake-Labs/pg_mooncake/pull/3 is merged --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1b2167ea11..85fb9c441d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1131,14 +1131,14 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1 +ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ 'v14') \ echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ + git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ cd pg_mooncake-src && \ git checkout "${PG_MOONCAKE_VERSION}" && \ git submodule update --init --depth 1 --recursive && \