From 65b4e6e7d6a89c1217c4c08c8118e252d3e17809 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 2 Jan 2024 11:24:52 +0300
Subject: [PATCH 01/21] Remove empty safekeeper init since truncateLsn.

It has caveats such as creating half empty segment which can't be
offloaded. Instead we'll pursue approach of pull_timeline, seeding new state
from some peer.
---
 pgxn/neon/walproposer.c                  | 34 ++++--------------------
 test_runner/regress/test_wal_acceptor.py |  2 +-
 2 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 2ea724f927..1f7c473e7d 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk)
 	{
 		/* safekeeper is empty or no common point, start from the beginning */
 		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
-
-		if (sk->startStreamingAt < wp->truncateLsn)
-		{
-			/*
-			 * There's a gap between the WAL starting point and a truncateLsn,
-			 * which can't appear in a normal working cluster. That gap means
-			 * that all safekeepers reported that they have persisted WAL up
-			 * to the truncateLsn before, but now current safekeeper tells
-			 * otherwise.
-			 *
-			 * Also we have a special condition here, which is empty
-			 * safekeeper with no history. In combination with a gap, that can
-			 * happen when we introduce a new safekeeper to the cluster. This
-			 * is a rare case, which is triggered manually for now, and should
-			 * be treated with care.
-			 */
-
-			/*
-			 * truncateLsn will not change without ack from current
-			 * safekeeper, and it's aligned to the WAL record, so we can
-			 * safely start streaming from this point.
-			 */
-			sk->startStreamingAt = wp->truncateLsn;
-
-			wp_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-				   sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-				   LSN_FORMAT_ARGS(sk->startStreamingAt));
-		}
+		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
+		 	 sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
+		/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
+		Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
 	}
 	else
 	{
@@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk)
 		}
 	}
 
-	Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
+	Assert(sk->startStreamingAt <= wp->availableLsn);
 
 	msg.tag = 'e';
 	msg.term = wp->propTerm;
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index cf8df389c8..4dfc883f4c 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
         f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb"
     )
 
-    endpoint.stop_and_destroy()
+    endpoint.stop()
     timeline_delete_wait_completed(ps_http, tenant_id, timeline_id)
 
     # Also delete and manually create timeline on safekeepers -- this tests

From edc962f1d7ac66bfb8dfe8f63d253d9389fc7b5c Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 3 Jan 2024 14:22:17 +0000
Subject: [PATCH 02/21] test_runner: test_issue_5878 log allow list (#6259)

## Problem


https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6254/7388706419/index.html#suites/5a4b8734277a9878cb429b80c314f470/e54c4f6f6ed22672

## Summary of changes

Permit the log message: because the test helper's detach function
increments the generation number, a detach/attach cycle can cause the
error if the test runner node is slow enough for the opportunistic
deletion queue flush on detach not to complete by the time we call
attach.
---
 test_runner/regress/test_layers_from_future.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 340188c1ae..999e077e45 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
 
     ps_http = env.pageserver.http_client()
 

From 753d97bd77f6d2c87f7167133585554eb463f01b Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 10:12:34 +0000
Subject: [PATCH 03/21] pageserver: don't delete ancestor shard layers

---
 .../src/tenant/remote_timeline_client.rs      | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1b0cf39fbe..60b40d70a7 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -818,8 +818,25 @@ impl RemoteTimelineClient {
     fn schedule_deletion_of_unlinked0(
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
-        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
     ) {
+        // Filter out any layers which were not created by this tenant shard.  These are
+        // layers that originate from some ancestor shard after a split, and may still
+        // be referenced by other shards. We are free to delete them locally and remove
+        // them from our index (and would have already done so when we reach this point
+        // in the code), but we may not delete them remotely.
+        with_metadata.retain(|(name, meta)| {
+            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
+                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
+            if !retain {
+                tracing::debug!(
+                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
+                    meta.shard
+                );
+            }
+            retain
+        });
+
         for (name, meta) in &with_metadata {
             info!(
                 "scheduling deletion of layer {}{} (shard {})",

From 6c79e12630b1330e838af8eb07346550a8bf7912 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 27 Dec 2023 15:57:17 +0000
Subject: [PATCH 04/21] pageserver: drop unwanted keys during compaction after
 split

---
 libs/pageserver_api/src/keyspace.rs |  3 +++
 libs/pageserver_api/src/shard.rs    | 15 +++++++++++++
 pageserver/src/tenant/timeline.rs   | 33 ++++++++++++++++++++++++-----
 3 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index 16651c322e..80183506d8 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -124,6 +124,9 @@ impl KeySpaceAccum {
                 if range.start == accum.end {
                     accum.end = range.end;
                 } else {
+                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
+                    // a new range here if the skipped region was all keys that don't belong on this shard.
+                    // (https://github.com/neondatabase/neon/issues/6247)
                     assert!(range.start > accum.end);
                     self.ranges.push(accum.clone());
                     *accum = range;
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 3e4936eec4..a186d93bce 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -422,6 +422,21 @@ impl ShardIdentity {
         }
     }
 
+    /// Return true if the key should be discarded if found in this shard's
+    /// data store, e.g. during compaction after a split
+    pub fn is_key_disposable(&self, key: &Key) -> bool {
+        if key_is_shard0(key) {
+            // Q: Why can't we dispose of shard0 content if we're not shard 0?
+            // A: because the WAL ingestion logic currently ingests some shard 0
+            //    content on all shards, even though it's only read on shard 0.  If we
+            //    dropped it, then subsequent WAL ingest to these keys would encounter
+            //    an error.
+            false
+        } else {
+            !self.is_key_local(key)
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 15a5ca1727..e8340a74b2 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -496,6 +496,11 @@ impl Timeline {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }
 
+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
         // XXX: structured stats collection for layer eviction here.
         trace!(
             "get page request for {}@{} from task kind {:?}",
@@ -2224,13 +2229,13 @@ impl Timeline {
                     return Err(layer_traversal_error(
                         if cfg!(test) {
                             format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
-                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                             )
                         } else {
                             format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}",
-                                key, cont_lsn, request_lsn
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
                             )
                         },
                         traversal_path,
@@ -3054,6 +3059,15 @@ impl Timeline {
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
+                        if self.shard_identity.is_key_disposable(&key) {
+                            debug!(
+                                "Dropping key {} during compaction (it belongs on shard {:?})",
+                                key,
+                                self.shard_identity.get_shard_number(&key)
+                            );
+                            key = key.next();
+                            continue;
+                        }
                         let img = match self.get(key, lsn, ctx).await {
                             Ok(img) => img,
                             Err(err) => {
@@ -3080,6 +3094,7 @@ impl Timeline {
                                 }
                             }
                         };
+
                         image_layer_writer.put_image(key, &img).await?;
                         key = key.next();
                     }
@@ -3650,7 +3665,15 @@ impl Timeline {
                 )))
             });
 
-            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            if !self.shard_identity.is_key_disposable(&key) {
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }
 
             if !new_layers.is_empty() {
                 fail_point!("after-timeline-compacted-first-L1");

From ef7c9c2ccc1a385f74455f45b54faa5b101065e6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 13:58:00 +0000
Subject: [PATCH 05/21] pageserver: fix active tenant lookup hitting
 secondaries with sharding

If there is some secondary shard for a tenant on the same
node as an attached shard, the secondary shard could trip up
this code and cause page_service to incorrectly
get an error instead of finding the attached shard.
---
 pageserver/src/tenant/mgr.rs | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 62922e8c99..250de7247d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -130,7 +130,7 @@ impl TenantsMap {
 
     /// A page service client sends a TenantId, and to look up the correct Tenant we must
     /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
+    fn resolve_attached_shard(
         &self,
         tenant_id: &TenantId,
         selector: ShardSelector,
@@ -140,25 +140,27 @@ impl TenantsMap {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                 for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Ignore all slots that don't contain an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        _ => continue,
+                    };
+
                     match selector {
                         ShardSelector::First => return Some(*slot.0),
                         ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                             return Some(*slot.0)
                         }
                         ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
+                            // First slot we see for this tenant, calculate the expected shard number
+                            // for the key: we will use this for checking if this and subsequent
+                            // slots contain the key, rather than recalculating the hash each time.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }
 
-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return Some(*slot.0);
                             }
                         }
                         _ => continue,
@@ -1257,9 +1259,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
         let locked = TENANTS.read().unwrap();
 
         // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
+        let tenant_shard_id = locked
+            .resolve_attached_shard(&tenant_id, shard_selector)
+            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                tenant_id,
+            )))?;
 
         let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
             .map_err(GetTenantError::MapState)?;

From 34ebfbdd6f509f4bd2eab807c2730f987ba5b0df Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Dec 2023 15:13:00 +0000
Subject: [PATCH 06/21] pageserver: fix handling getpage with multiple shards
 on one node

Previously, we would wait for the LSN to be visible on whichever
timeline we happened to load at the start of the connection, then
proceed to look up the correct timeline for the key and do the read.

If the timeline holding the key was behind the timeline we used
for the LSN wait, then we might serve an apparently-successful read result
that actually contains data from behind the requested lsn.
---
 pageserver/src/page_service.rs | 41 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index db07a600e5..be9f478f25 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -802,7 +802,7 @@ impl PageServerHandler {
         }))
     }
 
-    async fn handle_get_page_at_lsn_request(
+    async fn do_handle_get_page_at_lsn_request(
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
@@ -812,20 +812,25 @@ impl PageServerHandler {
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;
-        /*
-        // Add a 1s delay to some requests. The delay helps the requests to
-        // hit the race condition from github issue #1047 more easily.
-        use rand::Rng;
-        if rand::thread_rng().gen::<u8>() < 5 {
-            std::thread::sleep(std::time::Duration::from_millis(1000));
-        }
-        */
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
+            .await?;
 
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
+    }
+
+    async fn handle_get_page_at_lsn_request(
+        &self,
+        timeline: &Timeline,
+        req: &PagestreamGetPageRequest,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<PagestreamBeMessage> {
         let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-                .await?
+        if timeline.get_shard_identity().is_key_local(&key) {
+            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
+                .await
         } else {
             // The Tenant shard we looked up at connection start does not hold this particular
             // key: look for other shards in this tenant.  This scenario occurs if a pageserver
@@ -860,14 +865,10 @@ impl PageServerHandler {
             // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
             // the GateGuard was already held over the whole connection.
             let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-                .await?
-        };
 
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
+            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
+                .await
+        }
     }
 
     #[allow(clippy::too_many_arguments)]

From 73a944205b5d76efd791a3218004423204790789 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 22 Dec 2023 11:52:09 +0000
Subject: [PATCH 07/21] pageserver: log details on shard routing error

---
 pageserver/src/page_service.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index be9f478f25..d478d375f8 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -857,6 +857,8 @@ impl PageServerHandler {
                     // informed yet.
                     //
                     // https://github.com/neondatabase/neon/issues/6038
+                    tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
+                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
                     return Err(anyhow::anyhow!("Request routed to wrong shard"));
                 }
                 Err(e) => return Err(e.into()),

From a2e083ebe04fd0283c26dba8d7fd9e981623aad9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Dec 2023 13:43:56 +0000
Subject: [PATCH 08/21] pageserver: make walredo shard-aware

This does not have a functional impact, but enables all
the logging in this code to include the shard_id
label.
---
 pageserver/benches/bench_walredo.rs |  5 ++-
 pageserver/src/tenant.rs            |  6 +--
 pageserver/src/walredo.rs           | 64 ++++++++++++++++-------------
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index ba41866935..4837626086 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
+use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
 
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
-    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_id);
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
 
     let manager = Arc::new(manager);
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e50987c84b..1660de8923 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -595,10 +595,9 @@ impl Tenant {
         mode: SpawnMode,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));
 
         let TenantSharedResources {
@@ -1145,10 +1144,9 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         reason: String,
     ) -> Arc<Tenant> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));
         Arc::new(Tenant::new(
             TenantState::Broken {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 94e95fd3b3..6918698f29 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,6 +22,7 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
+use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
 
 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};
 
-#[cfg(feature = "testing")]
-use pageserver_api::shard::TenantShardId;
-
 use crate::config::PageServerConf;
 use crate::metrics::{
     WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -92,7 +90,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
     redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -186,10 +184,13 @@ impl PostgresRedoManager {
     ///
     /// Create a new PostgresRedoManager.
     ///
-    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
+    pub fn new(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+    ) -> PostgresRedoManager {
         // The actual process is launched lazily, on first request.
         PostgresRedoManager {
-            tenant_id,
+            tenant_shard_id,
             conf,
             last_redo_at: std::sync::Mutex::default(),
             redo_process: RwLock::new(None),
@@ -244,8 +245,12 @@ impl PostgresRedoManager {
                                 let timer =
                                     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                 let proc = Arc::new(
-                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
-                                        .context("launch walredo process")?,
+                                    WalRedoProcess::launch(
+                                        self.conf,
+                                        self.tenant_shard_id,
+                                        pg_version,
+                                    )
+                                    .context("launch walredo process")?,
                                 );
                                 timer.observe_duration();
                                 *proc_guard = Some(Arc::clone(&proc));
@@ -638,7 +643,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
     #[allow(dead_code)]
     conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     // Some() on construction, only becomes None on Drop.
     child: Option<NoLeakChild>,
     stdout: Mutex<ProcessOutput>,
@@ -652,10 +657,10 @@ impl WalRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
-    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
     fn launch(
         conf: &'static PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
         let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -680,7 +685,7 @@ impl WalRedoProcess {
             // as close-on-exec by default, but that's not enough, since we use
             // libraries that directly call libc open without setting that flag.
             .close_fds()
-            .spawn_no_leak_child(tenant_id)
+            .spawn_no_leak_child(tenant_shard_id)
             .context("spawn process")?;
         WAL_REDO_PROCESS_COUNTERS.started.inc();
         let mut child = scopeguard::guard(child, |child| {
@@ -741,12 +746,12 @@ impl WalRedoProcess {
                         error!(error=?e, "failed to read from walredo stderr");
                     }
                 }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
         );
 
         Ok(Self {
             conf,
-            tenant_id,
+            tenant_shard_id,
             child: Some(child),
             stdin: Mutex::new(ProcessInput {
                 stdin,
@@ -772,7 +777,7 @@ impl WalRedoProcess {
     // Apply given WAL records ('records') over an old page image. Returns
     // new page image.
     //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
     fn apply_wal_records(
         &self,
         tag: BufferTag,
@@ -966,11 +971,7 @@ impl WalRedoProcess {
         // these files will be collected to an allure report
         let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
 
-        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
-        let path = self
-            .conf
-            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
-            .join(&filename);
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
 
         let res = std::fs::OpenOptions::new()
             .write(true)
@@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantId,
+    tenant_id: TenantShardId,
     child: Option<Child>,
 }
 
@@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild {
 }
 
 impl NoLeakChild {
-    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
         let child = command.spawn()?;
         Ok(NoLeakChild {
             tenant_id,
@@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild {
             Some(child) => child,
             None => return,
         };
-        let tenant_id = self.tenant_id;
+        let tenant_shard_id = self.tenant_id;
         // Offload the kill+wait of the child process into the background.
         // If someone stops the runtime, we'll leak the child process.
         // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild {
             tokio::task::spawn_blocking(move || {
                 // Intentionally don't inherit the tracing context from whoever is dropping us.
                 // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!("walredo", %tenant_id);
+                let span = tracing::info_span!(
+                    "walredo",
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug()
+                );
                 let _entered = span.enter();
                 Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
             })
@@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild {
 }
 
 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
 }
 
 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
         NoLeakChild::spawn(tenant_id, self)
     }
 }
@@ -1155,6 +1160,7 @@ mod tests {
     use crate::repository::Key;
     use crate::{config::PageServerConf, walrecord::NeonWalRecord};
     use bytes::Bytes;
+    use pageserver_api::shard::TenantShardId;
     use std::str::FromStr;
     use utils::{id::TenantId, lsn::Lsn};
 
@@ -1264,9 +1270,9 @@ mod tests {
             let repo_dir = camino_tempfile::tempdir()?;
             let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
             let conf = Box::leak(Box::new(conf));
-            let tenant_id = TenantId::generate();
+            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-            let manager = PostgresRedoManager::new(conf, tenant_id);
+            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
 
             Ok(RedoHarness {
                 _repo_dir: repo_dir,

From c119af8ddd019a5eebf37d28ce54c4424a1faab2 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 27 Dec 2023 16:02:09 +0000
Subject: [PATCH 09/21] pageserver: run at least 2 background task threads

Otherwise an assertion in CONCURRENT_BACKGROUND_TASKS will
trip if you try to run the pageserver on a single core.
---
 pageserver/src/task_mgr.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index cb1b2b8011..eabb1d0022 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
     // else, but that has not been needed in a long time.
     std::env::var("TOKIO_WORKER_THREADS")
         .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
 });
 
 #[derive(Debug, Clone, Copy)]

From 7662df6ca0c5c0977571cb1fcae3746e6af4fbae Mon Sep 17 00:00:00 2001
From: Sasha Krassovsky <krassovskysasha@gmail.com>
Date: Wed, 3 Jan 2024 11:36:38 -0800
Subject: [PATCH 10/21] Fix minimum backoff to 1ms

---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index e58c28d7d5..3a7c0f1bb6 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,7 +35,7 @@
 
 #define PageStoreTrace DEBUG5
 
-#define MIN_RECONNECT_INTERVAL_USEC 100
+#define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000
 
 bool		connected = false;

From 18e92081586c7b10c97b2981e8b26dfb24fe5095 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Thu, 4 Jan 2024 10:40:03 +0000
Subject: [PATCH 11/21] pageserver: improved error handling for shard routing
 error, timeline not found (#6262)

## Problem

- When a client requests a key that isn't found in any shard on the node
(edge case that only happens if a compute's config is out of date), we
should prompt them to reconnect (as this includes a backoff), since they
will not be able to complete the request until they eventually get a
correct pageserver connection string.
- QueryError::Other is used excessively: this contains a type-ambiguous
anyhow::Error and is logged very verbosely (including backtrace).

## Summary of changes

- Introduce PageStreamError to replace use of anyhow::Error in request
handlers for getpage, etc.
- Introduce Reconnect and NotFound variants to QueryError
- Map the "shard routing error" case to PageStreamError::Reconnect ->
QueryError::Reconnect
- Update type conversions for LSN timeouts and tenant/timeline not found
errors to use PageStreamError::NotFound->QueryError::NotFound
---
 libs/postgres_backend/src/lib.rs    |  23 +++-
 libs/utils/src/http/error.rs        |   7 ++
 pageserver/src/http/routes.rs       |   1 +
 pageserver/src/page_service.rs      | 177 ++++++++++++++++++++--------
 pageserver/src/pgdatadir_mapping.rs |  38 +++---
 pageserver/src/tenant.rs            |  11 +-
 pageserver/src/tenant/timeline.rs   |  99 ++++++++--------
 7 files changed, 237 insertions(+), 119 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 1dae008a4f..73d25619c3 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,6 +35,12 @@ pub enum QueryError {
     /// We were instructed to shutdown while processing the query
     #[error("Shutting down")]
     Shutdown,
+    /// Query handler indicated that client should reconnect
+    #[error("Server requested reconnect")]
+    Reconnect,
+    /// Query named an entity that was not found
+    #[error("Not found: {0}")]
+    NotFound(std::borrow::Cow<'static, str>),
     /// Authentication failure
     #[error("Unauthorized: {0}")]
     Unauthorized(std::borrow::Cow<'static, str>),
@@ -54,9 +60,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
     pub fn pg_error_code(&self) -> &'static [u8; 5] {
         match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
             Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
             Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
         }
     }
@@ -425,6 +431,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 info!("Stopped due to shutdown");
                 Ok(())
             }
+            Err(QueryError::Reconnect) => {
+                // Dropping out of this loop implicitly disconnects
+                info!("Stopped due to handler reconnect request");
+                Ok(())
+            }
             Err(QueryError::Disconnected(e)) => {
                 info!("Disconnected ({e:#})");
                 // Disconnection is not an error: we just use it that way internally to drop
@@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
     match e {
         QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Reconnect => "reconnect".to_string(),
         QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::NotFound(_) => "not found".to_string(),
         QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
         QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
         QueryError::Other(e) => format!("{e:#}"),
@@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) {
         QueryError::SimulatedConnectionError => {
             error!("query handler for query '{query}' failed due to a simulated connection error")
         }
+        QueryError::Reconnect => {
+            info!("query handler for '{query}' requested client to reconnect")
+        }
         QueryError::Shutdown => {
             info!("query handler for '{query}' cancelled during tenant shutdown")
         }
+        QueryError::NotFound(reason) => {
+            info!("query handler for '{query}' entity not found: {reason}")
+        }
         QueryError::Unauthorized(e) => {
             warn!("query handler for '{query}' failed with authentication error: {e}");
         }
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index ac68b04888..3e9281ac81 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -31,6 +31,9 @@ pub enum ApiError {
     #[error("Shutting down")]
     ShuttingDown,
 
+    #[error("Timeout")]
+    Timeout(Cow<'static, str>),
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -67,6 +70,10 @@ impl ApiError {
                 err.to_string(),
                 StatusCode::SERVICE_UNAVAILABLE,
             ),
+            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::REQUEST_TIMEOUT,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 157e6b4e3e..8265627cb5 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -152,6 +152,7 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::AncestorStopping(_) => {
                 ApiError::ResourceUnavailable(format!("{pre}").into())
             }
+            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
         }
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index d478d375f8..291490d016 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
+use std::borrow::Cow;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -61,6 +62,9 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
+use crate::tenant::timeline::WaitLsnError;
+use crate::tenant::GetTimelineError;
+use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
@@ -283,6 +287,64 @@ struct PageServerHandler {
     connection_ctx: RequestContext,
 }
 
+#[derive(thiserror::Error, Debug)]
+enum PageStreamError {
+    /// We encountered an error that should prompt the client to reconnect:
+    /// in practice this means we drop the connection without sending a response.
+    #[error("Reconnect required: {0}")]
+    Reconnect(Cow<'static, str>),
+
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
+
+    /// Something went wrong reading a page: this likely indicates a pageserver bug
+    #[error("Read error: {0}")]
+    Read(PageReconstructError),
+
+    /// Ran out of time waiting for an LSN
+    #[error("LSN timeout: {0}")]
+    LsnTimeout(WaitLsnError),
+
+    /// The entity required to serve the request (tenant or timeline) is not found,
+    /// or is not found in a suitable state to serve a request.
+    #[error("Not found: {0}")]
+    NotFound(std::borrow::Cow<'static, str>),
+
+    /// Request asked for something that doesn't make sense, like an invalid LSN
+    #[error("Bad request: {0}")]
+    BadRequest(std::borrow::Cow<'static, str>),
+}
+
+impl From<PageReconstructError> for PageStreamError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => Self::Shutdown,
+            e => Self::Read(e),
+        }
+    }
+}
+
+impl From<GetActiveTimelineError> for PageStreamError {
+    fn from(value: GetActiveTimelineError) -> Self {
+        match value {
+            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
+            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
+        }
+    }
+}
+
+impl From<WaitLsnError> for PageStreamError {
+    fn from(value: WaitLsnError) -> Self {
+        match value {
+            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
+            WaitLsnError::Shutdown => Self::Shutdown,
+            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
+        }
+    }
+}
+
 impl PageServerHandler {
     pub fn new(
         conf: &'static PageServerConf,
@@ -428,7 +490,7 @@ impl PageServerHandler {
         // Check that the timeline exists
         let timeline = tenant
             .get_timeline(timeline_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
+            .map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
 
         // Avoid starting new requests if the timeline has already started shutting down,
         // and block timeline shutdown until this request is complete, or drops out due
@@ -520,32 +582,44 @@ impl PageServerHandler {
                 }
             };
 
-            if let Err(e) = &response {
-                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                // because wait_lsn etc will drop out
-                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                // is_canceled(): [`Timeline::shutdown`]` has entered
-                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
+            match response {
+                Err(PageStreamError::Shutdown) => {
                     // If we fail to fulfil a request during shutdown, which may be _because_ of
                     // shutdown, then do not send the error to the client.  Instead just drop the
                     // connection.
-                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
+                    span.in_scope(|| info!("dropping connection due to shutdown"));
                     return Err(QueryError::Shutdown);
                 }
+                Err(PageStreamError::Reconnect(reason)) => {
+                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    return Err(QueryError::Reconnect);
+                }
+                Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
+                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
+                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
+                    //
+                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                    // because wait_lsn etc will drop out
+                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                    // is_canceled(): [`Timeline::shutdown`]` has entered
+                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
+                    return Err(QueryError::Shutdown);
+                }
+                r => {
+                    let response_msg = r.unwrap_or_else(|e| {
+                        // print the all details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    });
+
+                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+                    self.flush_cancellable(pgb, &timeline.cancel).await?;
+                }
             }
-
-            let response = response.unwrap_or_else(|e| {
-                // print the all details to the log with {:#}, but for the client the
-                // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                // here includes cancellation which is not an error.
-                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                PagestreamBeMessage::Error(PagestreamErrorResponse {
-                    message: e.to_string(),
-                })
-            });
-
-            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb, &timeline.cancel).await?;
         }
         Ok(())
     }
@@ -692,7 +766,7 @@ impl PageServerHandler {
         latest: bool,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Lsn> {
+    ) -> Result<Lsn, PageStreamError> {
         if latest {
             // Latest page version was requested. If LSN is given, it is a hint
             // to the page server that there have been no modifications to the
@@ -723,15 +797,19 @@ impl PageServerHandler {
             }
         } else {
             if lsn == Lsn(0) {
-                anyhow::bail!("invalid LSN(0) in request");
+                return Err(PageStreamError::BadRequest(
+                    "invalid LSN(0) in request".into(),
+                ));
             }
             timeline.wait_lsn(lsn, ctx).await?;
         }
-        anyhow::ensure!(
-            lsn >= **latest_gc_cutoff_lsn,
-            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-            lsn, **latest_gc_cutoff_lsn
-        );
+
+        if lsn < **latest_gc_cutoff_lsn {
+            return Err(PageStreamError::BadRequest(format!(
+                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+                lsn, **latest_gc_cutoff_lsn
+            ).into()));
+        }
         Ok(lsn)
     }
 
@@ -740,7 +818,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamExistsRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -760,7 +838,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamNblocksRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -780,7 +858,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -807,7 +885,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -826,7 +904,7 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let key = rel_block_to_key(req.rel, req.blkno);
         if timeline.get_shard_identity().is_key_local(&key) {
             self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
@@ -849,24 +927,26 @@ impl PageServerHandler {
                 Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                     // We already know this tenant exists in general, because we resolved it at
                     // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    tracing::warn!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
+                    // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                    // mapping is out of date.
+                    tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
                         timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                    // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                    // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                    // and talk to a different pageserver.
+                    return Err(PageStreamError::Reconnect(
+                        "getpage@lsn request routed to wrong shard".into(),
+                    ));
                 }
                 Err(e) => return Err(e.into()),
             };
 
             // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
             // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            let _timeline_guard = timeline
+                .gate
+                .enter()
+                .map_err(|_| PageStreamError::Shutdown)?;
 
             self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
                 .await
@@ -1011,9 +1091,7 @@ impl PageServerHandler {
         )
         .await
         .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;
         Ok(timeline)
     }
 }
@@ -1435,14 +1513,15 @@ enum GetActiveTimelineError {
     #[error(transparent)]
     Tenant(GetActiveTenantError),
     #[error(transparent)]
-    Timeline(anyhow::Error),
+    Timeline(#[from] GetTimelineError),
 }
 
 impl From<GetActiveTimelineError> for QueryError {
     fn from(e: GetActiveTimelineError) -> Self {
         match e {
+            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
             GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
         }
     }
 }
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 9fe75e5baf..f11a72f2ab 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -160,7 +160,7 @@ impl Timeline {
     //------------------------------------------------------------------------------
 
     /// Look up given page version.
-    pub async fn get_rel_page_at_lsn(
+    pub(crate) async fn get_rel_page_at_lsn(
         &self,
         tag: RelTag,
         blknum: BlockNumber,
@@ -191,7 +191,7 @@ impl Timeline {
     }
 
     // Get size of a database in blocks
-    pub async fn get_db_size(
+    pub(crate) async fn get_db_size(
         &self,
         spcnode: Oid,
         dbnode: Oid,
@@ -211,7 +211,7 @@ impl Timeline {
     }
 
     /// Get size of a relation file
-    pub async fn get_rel_size(
+    pub(crate) async fn get_rel_size(
         &self,
         tag: RelTag,
         version: Version<'_>,
@@ -256,7 +256,7 @@ impl Timeline {
     }
 
     /// Does relation exist?
-    pub async fn get_rel_exists(
+    pub(crate) async fn get_rel_exists(
         &self,
         tag: RelTag,
         version: Version<'_>,
@@ -291,7 +291,7 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn list_rels(
+    pub(crate) async fn list_rels(
         &self,
         spcnode: Oid,
         dbnode: Oid,
@@ -319,7 +319,7 @@ impl Timeline {
     }
 
     /// Look up given SLRU page version.
-    pub async fn get_slru_page_at_lsn(
+    pub(crate) async fn get_slru_page_at_lsn(
         &self,
         kind: SlruKind,
         segno: u32,
@@ -332,7 +332,7 @@ impl Timeline {
     }
 
     /// Get size of an SLRU segment
-    pub async fn get_slru_segment_size(
+    pub(crate) async fn get_slru_segment_size(
         &self,
         kind: SlruKind,
         segno: u32,
@@ -345,7 +345,7 @@ impl Timeline {
     }
 
     /// Get size of an SLRU segment
-    pub async fn get_slru_segment_exists(
+    pub(crate) async fn get_slru_segment_exists(
         &self,
         kind: SlruKind,
         segno: u32,
@@ -372,7 +372,7 @@ impl Timeline {
     /// so it's not well defined which LSN you get if there were multiple commits
     /// "in flight" at that point in time.
     ///
-    pub async fn find_lsn_for_timestamp(
+    pub(crate) async fn find_lsn_for_timestamp(
         &self,
         search_timestamp: TimestampTz,
         cancel: &CancellationToken,
@@ -452,7 +452,7 @@ impl Timeline {
     /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
     /// with a smaller/larger timestamp.
     ///
-    pub async fn is_latest_commit_timestamp_ge_than(
+    pub(crate) async fn is_latest_commit_timestamp_ge_than(
         &self,
         search_timestamp: TimestampTz,
         probe_lsn: Lsn,
@@ -475,7 +475,7 @@ impl Timeline {
     /// Obtain the possible timestamp range for the given lsn.
     ///
     /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub async fn get_timestamp_for_lsn(
+    pub(crate) async fn get_timestamp_for_lsn(
         &self,
         probe_lsn: Lsn,
         ctx: &RequestContext,
@@ -532,7 +532,7 @@ impl Timeline {
     }
 
     /// Get a list of SLRU segments
-    pub async fn list_slru_segments(
+    pub(crate) async fn list_slru_segments(
         &self,
         kind: SlruKind,
         version: Version<'_>,
@@ -548,7 +548,7 @@ impl Timeline {
         }
     }
 
-    pub async fn get_relmap_file(
+    pub(crate) async fn get_relmap_file(
         &self,
         spcnode: Oid,
         dbnode: Oid,
@@ -561,7 +561,7 @@ impl Timeline {
         Ok(buf)
     }
 
-    pub async fn list_dbdirs(
+    pub(crate) async fn list_dbdirs(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -575,7 +575,7 @@ impl Timeline {
         }
     }
 
-    pub async fn get_twophase_file(
+    pub(crate) async fn get_twophase_file(
         &self,
         xid: TransactionId,
         lsn: Lsn,
@@ -586,7 +586,7 @@ impl Timeline {
         Ok(buf)
     }
 
-    pub async fn list_twophase_files(
+    pub(crate) async fn list_twophase_files(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -600,7 +600,7 @@ impl Timeline {
         }
     }
 
-    pub async fn get_control_file(
+    pub(crate) async fn get_control_file(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -608,7 +608,7 @@ impl Timeline {
         self.get(CONTROLFILE_KEY, lsn, ctx).await
     }
 
-    pub async fn get_checkpoint(
+    pub(crate) async fn get_checkpoint(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -616,7 +616,7 @@ impl Timeline {
         self.get(CHECKPOINT_KEY, lsn, ctx).await
     }
 
-    pub async fn list_aux_files(
+    pub(crate) async fn list_aux_files(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1660de8923..7c609452e5 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -56,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
+use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -1758,7 +1759,15 @@ impl Tenant {
                     // decoding the new WAL might need to look up previous pages, relation
                     // sizes etc. and that would get confused if the previous page versions
                     // are not in the repository yet.
-                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
+                    ancestor_timeline
+                        .wait_lsn(*lsn, ctx)
+                        .await
+                        .map_err(|e| match e {
+                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
+                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
+                            }
+                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
+                        })?;
                 }
 
                 self.branch_timeline(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e8340a74b2..24a92859b7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -373,15 +373,20 @@ pub struct GcInfo {
 }
 
 /// An error happened in a get() operation.
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum PageReconstructError {
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 
+    #[error("Ancestor LSN wait error: {0}")]
+    AncestorLsnTimeout(#[from] WaitLsnError),
+
     /// The operation was cancelled
+    #[error("Cancelled")]
     Cancelled,
 
     /// The ancestor of this is being stopped
+    #[error("ancestor timeline {0} is being stopped")]
     AncestorStopping(TimelineId),
 
     /// An error happened replaying WAL records
@@ -402,32 +407,6 @@ enum FlushLayerError {
     Other(#[from] anyhow::Error),
 }
 
-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
-impl std::fmt::Display for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
     Initial,
@@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum WaitLsnError {
+    // Called on a timeline which is shutting down
+    #[error("Shutdown")]
+    Shutdown,
+
+    // Called on an timeline not in active state or shutting down
+    #[error("Bad state (not active)")]
+    BadState,
+
+    // Timeout expired while waiting for LSN to catch up with goal.
+    #[error("{0}")]
+    Timeout(String),
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -486,7 +480,7 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn get(
+    pub(crate) async fn get(
         &self,
         key: Key,
         lsn: Lsn,
@@ -634,24 +628,28 @@ impl Timeline {
     /// You should call this before any of the other get_* or list_* functions. Calling
     /// those functions with an LSN that has been processed yet is an error.
     ///
-    pub async fn wait_lsn(
+    pub(crate) async fn wait_lsn(
         &self,
         lsn: Lsn,
         _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
+    ) -> Result<(), WaitLsnError> {
+        if self.cancel.is_cancelled() {
+            return Err(WaitLsnError::Shutdown);
+        } else if !self.is_active() {
+            return Err(WaitLsnError::BadState);
+        }
 
         // This should never be called from the WAL receiver, because that could lead
         // to a deadlock.
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
             "wait_lsn cannot be called in WAL receiver"
         );
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
             "wait_lsn cannot be called in WAL receiver"
         );
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
             "wait_lsn cannot be called in WAL receiver"
         );
@@ -665,18 +663,22 @@ impl Timeline {
         {
             Ok(()) => Ok(()),
             Err(e) => {
-                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                drop(_timer);
-                let walreceiver_status = self.walreceiver_status();
-                Err(anyhow::Error::new(e).context({
-                    format!(
+                use utils::seqwait::SeqWaitError::*;
+                match e {
+                    Shutdown => Err(WaitLsnError::Shutdown),
+                    Timeout => {
+                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                        drop(_timer);
+                        let walreceiver_status = self.walreceiver_status();
+                        Err(WaitLsnError::Timeout(format!(
                         "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                         lsn,
                         self.get_last_record_lsn(),
                         self.get_disk_consistent_lsn(),
                         walreceiver_status,
-                    )
-                }))
+                    )))
+                    }
+                }
             }
         }
     }
@@ -2295,11 +2297,12 @@ impl Timeline {
                 ancestor
                     .wait_lsn(timeline.ancestor_lsn, ctx)
                     .await
-                    .with_context(|| {
-                        format!(
-                            "wait for lsn {} on ancestor timeline_id={}",
-                            timeline.ancestor_lsn, ancestor.timeline_id
-                        )
+                    .map_err(|e| match e {
+                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
+                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
+                        e @ WaitLsnError::BadState => {
+                            PageReconstructError::Other(anyhow::anyhow!(e))
+                        }
                     })?;
 
                 timeline_owned = ancestor;
@@ -4228,7 +4231,7 @@ impl Timeline {
                     .context("Failed to reconstruct a page image:")
                 {
                     Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::from(e)),
+                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                 };
 
                 if img.len() == page_cache::PAGE_SZ {

From f3b5db1443f7c55fc5dcae0032970846f5c69b7e Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Thu, 4 Jan 2024 21:40:38 +0400
Subject: [PATCH 12/21] Add API for safekeeper timeline copy (#6091)

Implement API for cloning a single timeline inside a safekeeper. Also
add API for calculating a sha256 hash of WAL, which is used in tests.

`/copy` API works by copying objects inside S3 for all but the last
segments, and the last segments are copied on-disk. A special temporary
directory is created for a timeline, because copy can take a lot of
time, especially for large timelines. After all files segments have been
prepared, this directory is mounted to the main tree and timeline is
loaded to memory.

Some caveats:
- large timelines can take a lot of time to copy, because we need to
copy many S3 segments
- caller should wait for HTTP call to finish indefinetely and don't
close the HTTP connection, because it will stop the process, which is
not continued in the background
- `until_lsn` must be a valid LSN, otherwise bad things can happen
- API will return 200 if specified `timeline_id` already exists, even if
it's not a copy
- each safekeeper will try to copy S3 segments, so it's better to not
call this API in-parallel on different safekeepers
---
 Cargo.lock                                   |   1 +
 libs/remote_storage/src/azure_blob.rs        |   6 +
 libs/remote_storage/src/lib.rs               |  13 +
 libs/remote_storage/src/local_fs.rs          |  14 ++
 libs/remote_storage/src/s3_bucket.rs         |  32 +++
 libs/remote_storage/src/s3_bucket/metrics.rs |   8 +-
 libs/remote_storage/src/simulate_failures.rs |   7 +
 libs/safekeeper_api/src/models.rs            |   6 +
 safekeeper/Cargo.toml                        |   1 +
 safekeeper/src/control_file.rs               |   7 +-
 safekeeper/src/copy_timeline.rs              | 250 +++++++++++++++++++
 safekeeper/src/debug_dump.rs                 |  57 +++++
 safekeeper/src/http/routes.rs                |  65 ++++-
 safekeeper/src/lib.rs                        |   1 +
 safekeeper/src/pull_timeline.rs              | 131 +++++++---
 safekeeper/src/timeline.rs                   |   3 +-
 safekeeper/src/timelines_global_map.rs       |  15 +-
 safekeeper/src/wal_backup.rs                 |  61 ++++-
 safekeeper/src/wal_storage.rs                |   2 +-
 test_runner/fixtures/neon_fixtures.py        |  22 ++
 test_runner/regress/test_wal_acceptor.py     |  80 ++++++
 21 files changed, 727 insertions(+), 55 deletions(-)
 create mode 100644 safekeeper/src/copy_timeline.rs

diff --git a/Cargo.lock b/Cargo.lock
index 93efbadd79..4dd195a895 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4475,6 +4475,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_with",
+ "sha2",
  "signal-hook",
  "storage_broker",
  "thiserror",
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 7ea1103eb2..18cf5d97ba 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage {
         }
         Ok(())
     }
+
+    async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
+        Err(anyhow::anyhow!(
+            "copy for azure blob storage is not implemented"
+        ))
+    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 3e408e3119..942d0016b0 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
 
     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+
+    /// Copy a remote object inside a bucket from one path to another.
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }
 
 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -374,6 +377,15 @@ impl GenericRemoteStorage {
             Self::Unreliable(s) => s.delete_objects(paths).await,
         }
     }
+
+    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.copy(from, to).await,
+            Self::AwsS3(s) => s.copy(from, to).await,
+            Self::AzureBlob(s) => s.copy(from, to).await,
+            Self::Unreliable(s) => s.copy(from, to).await,
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -660,6 +672,7 @@ impl ConcurrencyLimiter {
             RequestKind::Put => &self.write,
             RequestKind::List => &self.read,
             RequestKind::Delete => &self.write,
+            RequestKind::Copy => &self.write,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index d1e7d325b9..bf8b6b5dde 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs {
         }
         Ok(())
     }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let from_path = from.with_base(&self.storage_root);
+        let to_path = to.with_base(&self.storage_root);
+        create_target_directory(&to_path).await?;
+        fs::copy(&from_path, &to_path).await.with_context(|| {
+            format!(
+                "Failed to copy file from '{from_path}' to '{to_path}'",
+                from_path = from_path,
+                to_path = to_path
+            )
+        })?;
+        Ok(())
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 0f95458ad1..d7b41edaaf 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket {
         Ok(())
     }
 
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let kind = RequestKind::Copy;
+        let _guard = self.permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);
+
+        // we need to specify bucket_name as a prefix
+        let copy_source = format!(
+            "{}/{}",
+            self.bucket_name,
+            self.relative_path_to_s3_object(from)
+        );
+
+        let res = self
+            .client
+            .copy_object()
+            .bucket(self.bucket_name.clone())
+            .key(self.relative_path_to_s3_object(to))
+            .copy_source(copy_source)
+            .send()
+            .await;
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
+        Ok(())
+    }
+
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
         // if prefix is not none then download file `prefix/from`
         // if prefix is none then download file `from`
diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs
index ea11edafa5..21dde14906 100644
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -11,6 +11,7 @@ pub(crate) enum RequestKind {
     Put = 1,
     Delete = 2,
     List = 3,
+    Copy = 4,
 }
 
 use RequestKind::*;
@@ -22,6 +23,7 @@ impl RequestKind {
             Put => "put_object",
             Delete => "delete_object",
             List => "list_objects",
+            Copy => "copy_object",
         }
     }
     const fn as_index(&self) -> usize {
@@ -29,7 +31,7 @@ impl RequestKind {
     }
 }
 
-pub(super) struct RequestTyped<C>([C; 4]);
+pub(super) struct RequestTyped<C>([C; 5]);
 
 impl<C> RequestTyped<C> {
     pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -38,8 +40,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List].into_iter();
-        let arr = std::array::from_fn::<C, 4, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy].into_iter();
+        let arr = std::array::from_fn::<C, 5, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 802b0db7f5..7f5adcea30 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper {
         }
         Ok(())
     }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        // copy is equivalent to download + upload
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.copy_object(from, to).await
+    }
 }
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 786712deb1..ce5a1e411e 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -51,3 +51,9 @@ pub struct SkTimelineInfo {
     #[serde(default)]
     pub http_connstr: Option<String>,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineCopyRequest {
+    pub target_timeline_id: TimelineId,
+    pub until_lsn: Lsn,
+}
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 4015c27933..364cad7892 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -54,6 +54,7 @@ postgres_ffi.workspace = true
 pq_proto.workspace = true
 remote_storage.workspace = true
 safekeeper_api.workspace = true
+sha2.workspace = true
 sd-notify.workspace = true
 storage_broker.workspace = true
 tokio-stream.workspace = true
diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index 7aadd67ac6..591bfea182 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -66,12 +66,10 @@ impl FileStorage {
 
     /// Create file storage for a new timeline, but don't persist it yet.
     pub fn create_new(
-        ttid: &TenantTimelineId,
+        timeline_dir: Utf8PathBuf,
         conf: &SafeKeeperConf,
         state: SafeKeeperState,
     ) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(ttid);
-
         let store = FileStorage {
             timeline_dir,
             conf: conf.clone(),
@@ -277,7 +275,8 @@ mod test {
             .await
             .expect("failed to create timeline dir");
         let state = SafeKeeperState::empty();
-        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
+        let timeline_dir = conf.timeline_dir(ttid);
+        let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
         Ok((storage, state))
     }
 
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
new file mode 100644
index 0000000000..ef88eb27e3
--- /dev/null
+++ b/safekeeper/src/copy_timeline.rs
@@ -0,0 +1,250 @@
+use std::sync::Arc;
+
+use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
+
+use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use tokio::{
+    fs::OpenOptions,
+    io::{AsyncSeekExt, AsyncWriteExt},
+};
+use tracing::{info, warn};
+use utils::{id::TenantTimelineId, lsn::Lsn};
+
+use crate::{
+    control_file::{FileStorage, Storage},
+    pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
+    safekeeper::SafeKeeperState,
+    timeline::{Timeline, TimelineError},
+    wal_backup::copy_s3_segments,
+    wal_storage::{wal_file_paths, WalReader},
+    GlobalTimelines, SafeKeeperConf,
+};
+
+// we don't want to have more than 10 segments on disk after copy, because they take space
+const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
+
+pub struct Request {
+    pub source: Arc<Timeline>,
+    pub until_lsn: Lsn,
+    pub destination_ttid: TenantTimelineId,
+}
+
+pub async fn handle_request(request: Request) -> Result<()> {
+    // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
+    //   if LSN will point to the middle of a WAL record, timeline will be in "broken" state
+
+    match GlobalTimelines::get(request.destination_ttid) {
+        // timeline already exists. would be good to check that this timeline is the copy
+        // of the source timeline, but it isn't obvious how to do that
+        Ok(_) => return Ok(()),
+        // timeline not found, we are going to create it
+        Err(TimelineError::NotFound(_)) => {}
+        // error, probably timeline was deleted
+        res => {
+            res?;
+        }
+    }
+
+    let conf = &GlobalTimelines::get_global_config();
+    let ttid = request.destination_ttid;
+
+    let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
+
+    let (mem_state, state) = request.source.get_state().await;
+    let start_lsn = state.timeline_start_lsn;
+    if start_lsn == Lsn::INVALID {
+        bail!("timeline is not initialized");
+    }
+    let backup_lsn = mem_state.backup_lsn;
+
+    {
+        let commit_lsn = mem_state.commit_lsn;
+        let flush_lsn = request.source.get_flush_lsn().await;
+
+        info!(
+            "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
+            start_lsn, backup_lsn, commit_lsn, flush_lsn
+        );
+
+        assert!(backup_lsn >= start_lsn);
+        assert!(commit_lsn >= start_lsn);
+        assert!(flush_lsn >= start_lsn);
+
+        if request.until_lsn > flush_lsn {
+            bail!("requested LSN is beyond the end of the timeline");
+        }
+        if request.until_lsn < start_lsn {
+            bail!("requested LSN is before the start of the timeline");
+        }
+
+        if request.until_lsn > commit_lsn {
+            warn!("copy_timeline WAL is not fully committed");
+        }
+
+        if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG {
+            // we have a lot of segments that are not backed up. we can try to wait here until
+            // segments will be backed up to remote storage, but it's not clear how long to wait
+            bail!("too many segments are not backed up");
+        }
+    }
+
+    let wal_seg_size = state.server.wal_seg_size as usize;
+    if wal_seg_size == 0 {
+        bail!("wal_seg_size is not set");
+    }
+
+    let first_segment = start_lsn.segment_number(wal_seg_size);
+    let last_segment = request.until_lsn.segment_number(wal_seg_size);
+
+    let new_backup_lsn = {
+        // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment
+        let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64));
+
+        if max_backup_lsn <= start_lsn {
+            // probably we are starting from the first segment, which was not backed up yet.
+            // note that start_lsn can be in the middle of the segment
+            start_lsn
+        } else {
+            // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up
+            assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0);
+            max_backup_lsn
+        }
+    };
+
+    // all previous segments will be copied inside S3
+    let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size);
+    assert!(first_ondisk_segment <= last_segment);
+    assert!(first_ondisk_segment >= first_segment);
+
+    copy_s3_segments(
+        wal_seg_size,
+        &request.source.ttid,
+        &request.destination_ttid,
+        first_segment,
+        first_ondisk_segment,
+    )
+    .await?;
+
+    copy_disk_segments(
+        conf,
+        &state,
+        wal_seg_size,
+        &request.source.ttid,
+        new_backup_lsn,
+        request.until_lsn,
+        &tli_dir_path,
+    )
+    .await?;
+
+    let mut new_state = SafeKeeperState::new(
+        &request.destination_ttid,
+        state.server.clone(),
+        vec![],
+        request.until_lsn,
+        start_lsn,
+    );
+    new_state.timeline_start_lsn = start_lsn;
+    new_state.peer_horizon_lsn = request.until_lsn;
+    new_state.backup_lsn = new_backup_lsn;
+
+    let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?;
+    file_storage.persist(&new_state).await?;
+
+    // now we have a ready timeline in a temp directory
+    validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
+    load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
+
+    Ok(())
+}
+
+async fn copy_disk_segments(
+    conf: &SafeKeeperConf,
+    persisted_state: &SafeKeeperState,
+    wal_seg_size: usize,
+    source_ttid: &TenantTimelineId,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+    tli_dir_path: &Utf8PathBuf,
+) -> Result<()> {
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        conf.timeline_dir(source_ttid),
+        persisted_state,
+        start_lsn,
+        true,
+    )?;
+
+    let mut buf = [0u8; MAX_SEND_SIZE];
+
+    let first_segment = start_lsn.segment_number(wal_seg_size);
+    let last_segment = end_lsn.segment_number(wal_seg_size);
+
+    for segment in first_segment..=last_segment {
+        let segment_start = segment * wal_seg_size as u64;
+        let segment_end = segment_start + wal_seg_size as u64;
+
+        let copy_start = segment_start.max(start_lsn.0);
+        let copy_end = segment_end.min(end_lsn.0);
+
+        let copy_start = copy_start - segment_start;
+        let copy_end = copy_end - segment_start;
+
+        let wal_file_path = {
+            let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?;
+
+            if segment == last_segment {
+                partial
+            } else {
+                normal
+            }
+        };
+
+        write_segment(
+            &mut buf,
+            &wal_file_path,
+            wal_seg_size as u64,
+            copy_start,
+            copy_end,
+            &mut wal_reader,
+        )
+        .await?;
+    }
+
+    Ok(())
+}
+
+async fn write_segment(
+    buf: &mut [u8],
+    file_path: &Utf8PathBuf,
+    wal_seg_size: u64,
+    from: u64,
+    to: u64,
+    reader: &mut WalReader,
+) -> Result<()> {
+    assert!(from <= to);
+    assert!(to <= wal_seg_size);
+
+    let mut file = OpenOptions::new()
+        .create(true)
+        .write(true)
+        .open(&file_path)
+        .await?;
+
+    // maybe fill with zeros, as in wal_storage.rs?
+    file.set_len(wal_seg_size).await?;
+    file.seek(std::io::SeekFrom::Start(from)).await?;
+
+    let mut bytes_left = to - from;
+    while bytes_left > 0 {
+        let len = bytes_left as usize;
+        let len = len.min(buf.len());
+        let len = reader.read(&mut buf[..len]).await?;
+        file.write_all(&buf[..len]).await?;
+        bytes_left -= len as u64;
+    }
+
+    file.flush().await?;
+    file.sync_all().await?;
+    Ok(())
+}
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
index daf9255ecb..c9ff1afdea 100644
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -7,13 +7,16 @@ use std::io::Read;
 use std::path::PathBuf;
 use std::sync::Arc;
 
+use anyhow::bail;
 use anyhow::Result;
 use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use postgres_ffi::XLogSegNo;
+use postgres_ffi::MAX_SEND_SIZE;
 use serde::Deserialize;
 use serde::Serialize;
 
+use sha2::{Digest, Sha256};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
 use utils::id::{TenantId, TimelineId};
@@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory;
 use crate::SafeKeeperConf;
 
 use crate::send_wal::WalSenderState;
+use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
 
 /// Various filters that influence the resulting JSON output.
@@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config {
         wal_backup_enabled: config.wal_backup_enabled,
     }
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineDigestRequest {
+    pub from_lsn: Lsn,
+    pub until_lsn: Lsn,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TimelineDigest {
+    pub sha256: String,
+}
+
+pub async fn calculate_digest(
+    tli: &Arc<crate::timeline::Timeline>,
+    request: TimelineDigestRequest,
+) -> Result<TimelineDigest> {
+    if request.from_lsn > request.until_lsn {
+        bail!("from_lsn is greater than until_lsn");
+    }
+
+    let conf = GlobalTimelines::get_global_config();
+    let (_, persisted_state) = tli.get_state().await;
+
+    if persisted_state.timeline_start_lsn > request.from_lsn {
+        bail!("requested LSN is before the start of the timeline");
+    }
+
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        tli.timeline_dir.clone(),
+        &persisted_state,
+        request.from_lsn,
+        true,
+    )?;
+
+    let mut hasher = Sha256::new();
+    let mut buf = [0u8; MAX_SEND_SIZE];
+
+    let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize;
+    while bytes_left > 0 {
+        let bytes_to_read = std::cmp::min(buf.len(), bytes_left);
+        let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?;
+        if bytes_read == 0 {
+            bail!("wal_reader.read returned 0 bytes");
+        }
+        hasher.update(&buf[..bytes_read]);
+        bytes_left -= bytes_read;
+    }
+
+    let digest = hasher.finalize();
+    let digest = hex::encode(digest);
+    Ok(TimelineDigest { sha256: digest })
+}
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 25a3334e63..5283ea19c1 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri};
 
 use once_cell::sync::Lazy;
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::SkTimelineInfo;
+use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
 use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
 use std::fmt;
@@ -14,19 +14,21 @@ use tokio::fs::File;
 use tokio::io::AsyncReadExt;
 use tokio_util::sync::CancellationToken;
 use utils::failpoint_support::failpoints_handler;
+use utils::http::request::parse_query_param;
 
 use std::io::Write as _;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
-use tracing::info_span;
+use tracing::{info_span, Instrument};
 use utils::http::endpoint::{request_span, ChannelWriter};
 
+use crate::debug_dump::TimelineDigestRequest;
 use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::Term;
 use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
-use crate::{debug_dump, pull_timeline};
+use crate::{copy_timeline, debug_dump, pull_timeline};
 
 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -204,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
     json_response(StatusCode::OK, resp)
 }
 
+async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let request_data: TimelineCopyRequest = json_request(&mut request).await?;
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "source_timeline_id")?,
+    );
+
+    let source = GlobalTimelines::get(ttid)?;
+
+    copy_timeline::handle_request(copy_timeline::Request{
+        source,
+        until_lsn: request_data.until_lsn,
+        destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
+    })
+        .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
+    let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
+
+    let request = TimelineDigestRequest {
+        from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "from_lsn is required"
+        )))?,
+        until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "until_lsn is required"
+        )))?,
+    };
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let response = debug_dump::calculate_digest(&tli, request)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, response)
+}
+
 /// Download a file from the timeline directory.
 // TODO: figure out a better way to copy files between safekeepers
 async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -472,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
             |r| request_span(r, timeline_files_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
+            |r| request_span(r, timeline_copy_handler),
+        )
         // for tests
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
         .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
+            request_span(r, timeline_digest_handler)
+        })
 }
 
 #[cfg(test)]
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 3a086f1f54..fc5f99eb00 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -16,6 +16,7 @@ mod auth;
 pub mod broker;
 pub mod control_file;
 pub mod control_file_upgrade;
+pub mod copy_timeline;
 pub mod debug_dump;
 pub mod handler;
 pub mod http;
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index ad3a18a536..93b51f32c0 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -1,16 +1,24 @@
+use std::sync::Arc;
+
+use camino::Utf8PathBuf;
+use camino_tempfile::Utf8TempDir;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 
 use anyhow::{bail, Context, Result};
 use tokio::io::AsyncWriteExt;
 use tracing::info;
-use utils::id::{TenantId, TenantTimelineId, TimelineId};
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
 
 use crate::{
     control_file, debug_dump,
     http::routes::TimelineStatus,
+    timeline::{Timeline, TimelineError},
     wal_storage::{self, Storage},
-    GlobalTimelines,
+    GlobalTimelines, SafeKeeperConf,
 };
 
 /// Info about timeline on safekeeper ready for reporting.
@@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result<Response> {
 async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
     let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
     info!(
-        "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
+        "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
         ttid,
         host,
         status.commit_lsn,
@@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
 
     if dump.timelines.len() != 1 {
         bail!(
-            "Expected to fetch single timeline, got {} timelines",
+            "expected to fetch single timeline, got {} timelines",
             dump.timelines.len()
         );
     }
 
     let timeline = dump.timelines.into_iter().next().unwrap();
     let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
-        "Timeline {} doesn't have disk content",
+        "timeline {} doesn't have disk content",
         ttid
     ))?;
 
@@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
     filenames.insert(0, "safekeeper.control".to_string());
 
     info!(
-        "Downloading {} files from safekeeper {}",
+        "downloading {} files from safekeeper {}",
         filenames.len(),
         host
     );
 
-    // Creating temp directory for a new timeline. It needs to be
-    // located on the same filesystem as the rest of the timelines.
-
-    // conf.workdir is usually /storage/safekeeper/data
-    // will try to transform it into /storage/safekeeper/tmp
-    let temp_base = conf
-        .workdir
-        .parent()
-        .ok_or(anyhow::anyhow!("workdir has no parent"))?
-        .join("tmp");
-
-    tokio::fs::create_dir_all(&temp_base).await?;
-
-    let tli_dir = camino_tempfile::Builder::new()
-        .suffix("_temptli")
-        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
-        .tempdir_in(temp_base)?;
-    let tli_dir_path = tli_dir.path().to_path_buf();
+    let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
 
     // Note: some time happens between fetching list of files and fetching files themselves.
     //       It's possible that some files will be removed from safekeeper and we will fail to fetch them.
@@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
     // TODO: fsync?
 
     // Let's create timeline from temp directory and verify that it's correct
+    let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?;
+    info!(
+        "finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
+        ttid, commit_lsn, flush_lsn
+    );
+    assert!(status.commit_lsn <= status.flush_lsn);
 
-    let control_path = tli_dir_path.join("safekeeper.control");
+    // Finally, load the timeline.
+    let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?;
+
+    Ok(Response {
+        safekeeper_host: host,
+    })
+}
+
+/// Create temp directory for a new timeline. It needs to be located on the same
+/// filesystem as the rest of the timelines. It will be automatically deleted when
+/// Utf8TempDir goes out of scope.
+pub async fn create_temp_timeline_dir(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+) -> Result<(Utf8TempDir, Utf8PathBuf)> {
+    // conf.workdir is usually /storage/safekeeper/data
+    // will try to transform it into /storage/safekeeper/tmp
+    let temp_base = conf
+        .workdir
+        .parent()
+        .ok_or(anyhow::anyhow!("workdir has no parent"))?
+        .join("tmp");
+
+    tokio::fs::create_dir_all(&temp_base).await?;
+
+    let tli_dir = camino_tempfile::Builder::new()
+        .suffix("_temptli")
+        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
+        .tempdir_in(temp_base)?;
+
+    let tli_dir_path = tli_dir.path().to_path_buf();
+
+    Ok((tli_dir, tli_dir_path))
+}
+
+/// Do basic validation of a temp timeline, before moving it to the global map.
+pub async fn validate_temp_timeline(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    path: &Utf8PathBuf,
+) -> Result<(Lsn, Lsn)> {
+    let control_path = path.join("safekeeper.control");
 
     let control_store = control_file::FileStorage::load_control_file(control_path)?;
     if control_store.server.wal_seg_size == 0 {
         bail!("wal_seg_size is not set");
     }
 
-    let wal_store =
-        wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
+    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
 
-    let commit_lsn = status.commit_lsn;
+    let commit_lsn = control_store.commit_lsn;
     let flush_lsn = wal_store.flush_lsn();
 
-    info!(
-        "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
-        ttid, commit_lsn, flush_lsn
-    );
-    assert!(status.commit_lsn <= status.flush_lsn);
+    Ok((commit_lsn, flush_lsn))
+}
+
+/// Move timeline from a temp directory to the main storage, and load it to the global map.
+/// This operation is done under a lock to prevent bugs if several concurrent requests are
+/// trying to load the same timeline. Note that it doesn't guard against creating the
+/// timeline with the same ttid, but no one should be doing this anyway.
+pub async fn load_temp_timeline(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    tmp_path: &Utf8PathBuf,
+) -> Result<Arc<Timeline>> {
+    // Take a lock to prevent concurrent loadings
+    let load_lock = GlobalTimelines::loading_lock().await;
+    let guard = load_lock.lock().await;
+
+    if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) {
+        bail!("timeline already exists, cannot overwrite it")
+    }
 
     // Move timeline dir to the correct location
     let timeline_path = conf.timeline_dir(&ttid);
 
     info!(
-        "Moving timeline {} from {} to {}",
-        ttid, tli_dir_path, timeline_path
+        "moving timeline {} from {} to {}",
+        ttid, tmp_path, timeline_path
     );
     tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
-    tokio::fs::rename(tli_dir_path, &timeline_path).await?;
+    tokio::fs::rename(tmp_path, &timeline_path).await?;
 
-    let tli = GlobalTimelines::load_timeline(ttid)
+    let tli = GlobalTimelines::load_timeline(&guard, ttid)
         .await
         .context("Failed to load timeline after copy")?;
 
     info!(
-        "Loaded timeline {}, flush_lsn={}",
+        "loaded timeline {}, flush_lsn={}",
         ttid,
         tli.get_flush_lsn().await
     );
 
-    Ok(Response {
-        safekeeper_host: host,
-    })
+    Ok(tli)
 }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index bdc9088138..2f284abe8c 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -141,7 +141,8 @@ impl SharedState {
 
         // We don't want to write anything to disk, because we may have existing timeline there.
         // These functions should not change anything on disk.
-        let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
+        let timeline_dir = conf.timeline_dir(ttid);
+        let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
         let wal_store =
             wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
         let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index cbb3342e40..92ac5ba66d 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -21,8 +21,12 @@ struct GlobalTimelinesState {
     timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
     wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
     conf: Option<SafeKeeperConf>,
+    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
 }
 
+// Used to prevent concurrent timeline loading.
+pub struct TimelineLoadLock;
+
 impl GlobalTimelinesState {
     /// Get configuration, which must be set once during init.
     fn get_conf(&self) -> &SafeKeeperConf {
@@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
         timelines: HashMap::new(),
         wal_backup_launcher_tx: None,
         conf: None,
+        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
     })
 });
 
@@ -174,8 +179,16 @@ impl GlobalTimelines {
         Ok(())
     }
 
+    /// Take a lock for timeline loading.
+    pub async fn loading_lock() -> Arc<tokio::sync::Mutex<TimelineLoadLock>> {
+        TIMELINES_STATE.lock().unwrap().load_lock.clone()
+    }
+
     /// Load timeline from disk to the memory.
-    pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub async fn load_timeline<'a>(
+        _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
+        ttid: TenantTimelineId,
+    ) -> Result<Arc<Timeline>> {
         let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
 
         match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index c99bbc7d61..e4499eaf50 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -7,7 +7,7 @@ use tokio::task::JoinHandle;
 use utils::id::NodeId;
 
 use std::cmp::min;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
@@ -531,3 +531,62 @@ pub async fn read_object(
 
     Ok(Box::pin(reader))
 }
+
+/// Copy segments from one timeline to another. Used in copy_timeline.
+pub async fn copy_s3_segments(
+    wal_seg_size: usize,
+    src_ttid: &TenantTimelineId,
+    dst_ttid: &TenantTimelineId,
+    from_segment: XLogSegNo,
+    to_segment: XLogSegNo,
+) -> Result<()> {
+    const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
+
+    let storage = REMOTE_STORAGE
+        .get()
+        .expect("failed to get remote storage")
+        .as_ref()
+        .unwrap();
+
+    let relative_dst_path =
+        Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
+
+    let remote_path = RemotePath::new(&relative_dst_path)?;
+
+    let files = storage.list_files(Some(&remote_path)).await?;
+    let uploaded_segments = &files
+        .iter()
+        .filter_map(|file| file.object_name().map(ToOwned::to_owned))
+        .collect::<HashSet<_>>();
+
+    debug!(
+        "these segments have already been uploaded: {:?}",
+        uploaded_segments
+    );
+
+    let relative_src_path =
+        Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
+
+    for segno in from_segment..to_segment {
+        if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
+            info!("copied all segments from {} until {}", from_segment, segno);
+        }
+
+        let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size);
+        if uploaded_segments.contains(&segment_name) {
+            continue;
+        }
+        debug!("copying segment {}", segment_name);
+
+        let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
+        let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
+
+        storage.copy_object(&from, &to).await?;
+    }
+
+    info!(
+        "finished copying segments from {} until {}",
+        from_segment, to_segment
+    );
+    Ok(())
+}
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index e7538f805c..8d138c701f 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -728,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
 }
 
 /// Helper returning full path to WAL segment file and its .partial brother.
-fn wal_file_paths(
+pub fn wal_file_paths(
     timeline_dir: &Utf8Path,
     segno: XLogSegNo,
     wal_seg_size: usize,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5b1a8ba27d..f33e17a76a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3032,6 +3032,28 @@ class SafekeeperHttpClient(requests.Session):
         assert isinstance(res_json, dict)
         return res_json
 
+    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
+            json=body,
+        )
+        res.raise_for_status()
+
+    def timeline_digest(
+        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
+    ) -> Dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
+            params={
+                "from_lsn": str(from_lsn),
+                "until_lsn": str(until_lsn),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def timeline_create(
         self,
         tenant_id: TenantId,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 4dfc883f4c..b4ce633531 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1838,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
     assert final_stats.get("START_REPLICATION", 0) >= 1
     # walproposer should connect to each safekeeper at least once
     assert final_stats.get("START_WAL_PUSH", 0) >= 3
+
+
+@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000])
+def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
+    target_percents = [10, 50, 90, 100]
+
+    neon_env_builder.num_safekeepers = 3
+    # we need remote storage that supports copy_object S3 API
+    neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+
+    lsns = []
+
+    def remember_lsn():
+        lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+        lsns.append(lsn)
+        return lsn
+
+    # remember LSN right after timeline creation
+    lsn = remember_lsn()
+    log.info(f"LSN after timeline creation: {lsn}")
+
+    endpoint.safe_psql("create table t(key int, value text)")
+
+    timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id)
+    timeline_start_lsn = timeline_status.timeline_start_lsn
+    log.info(f"Timeline start LSN: {timeline_start_lsn}")
+
+    current_percent = 0.0
+    for new_percent in target_percents:
+        new_rows = insert_rows * (new_percent - current_percent) / 100
+        current_percent = new_percent
+
+        if new_rows == 0:
+            continue
+
+        endpoint.safe_psql(
+            f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)"
+        )
+
+        # remember LSN right after reaching new_percent
+        lsn = remember_lsn()
+        log.info(f"LSN after inserting {new_rows} rows: {lsn}")
+
+    # TODO: would be also good to test cases where not all segments are uploaded to S3
+
+    for lsn in lsns:
+        new_timeline_id = TimelineId.generate()
+        log.info(f"Copying branch for LSN {lsn}, to timeline {new_timeline_id}")
+
+        orig_digest = (
+            env.safekeepers[0]
+            .http_client()
+            .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, lsn)
+        )
+        log.info(f"Original digest: {orig_digest}")
+
+        for sk in env.safekeepers:
+            sk.http_client().copy_timeline(
+                tenant_id,
+                timeline_id,
+                {
+                    "target_timeline_id": str(new_timeline_id),
+                    "until_lsn": str(lsn),
+                },
+            )
+
+            new_digest = sk.http_client().timeline_digest(
+                tenant_id, new_timeline_id, timeline_start_lsn, lsn
+            )
+            log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}")
+
+            assert orig_digest == new_digest
+
+    # TODO: test timelines can start after copy

From d260426a14606eceb5c7da447ccb089214bd0902 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 5 Jan 2024 11:48:06 +0100
Subject: [PATCH 13/21] is_rel_block_key: exclude the relsize key (#6266)

Before this PR, `is_rel_block_key` returns true for the blknum
`0xffffffff`,
which is a blknum that's actually never written by Postgres, but used by
Neon Pageserver to store the relsize.

Quoting @MMeent:

> PostgreSQL can't extend the relation beyond size of 0xFFFFFFFF blocks,
> so block number 0xFFFFFFFE is the last valid block number.

This PR changes the definition of the function to exclude blknum
0xffffffff.

My motivation for doing this change is to fix the `pagebench` getpage
benchmark, which uses `is_rel_block_key` to filter the keyspace for
valid pages to request from page_service.
fixes https://github.com/neondatabase/neon/issues/6210

I checked other users of the function.

The first one is `key_is_shard0`, which already had added an exemption
for 0xffffffff. So, there's no functional change with this PR.

The second one is `DatadirModification::flush`[^1]. With this PR,
`.flush()` will skip the relsize key, whereas it didn't
before. This means we will pile up all the relsize key-value pairs
`(Key,u32)`
in `DatadirModification::pending_updates` until `.commit()` is called.

The only place I can think of where that would be a problem is if we
import from a full basebackup, and don't `.commit()` regularly,
like we currently don't do in `import_basebackup_from_tar`.
It exposes us to input-controlled allocations.
However, that was already the case for the other keys that are skipped,
so, one can argue that this change is not making the situation much
worse.

[^1]: That type's `flush()` and `commit()` methods are terribly named,
but,
      that's for another time
---
 libs/pageserver_api/src/key.rs   | 2 +-
 libs/pageserver_api/src/shard.rs | 7 +------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index d680a5600e..3e1bba2a06 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -142,7 +142,7 @@ impl Key {
 }
 
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
+    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }
 
 impl std::str::FromStr for Key {
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index a186d93bce..18ef2be523 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -530,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool {
     // relation pages are distributed to shards other than shard zero. Everything else gets
     // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
     // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    // all metadata.
-    // - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+    !is_rel_block_key(key)
 }
 
 /// Provide the same result as the function in postgres `hashfn.h` with the same name

From 3c560d27a826ba54181208ec56bf5fbaab74844d Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 5 Jan 2024 12:29:20 +0000
Subject: [PATCH 14/21] pageserver: implement secondary-mode downloads (#6123)

Follows on from #6050 , in which we upload heatmaps. Secondary locations
will now poll those heatmaps and download layers mentioned in the
heatmap.

TODO:
- [X] ~Unify/reconcile stats for behind-schedule execution with
warn_when_period_overrun
(https://github.com/neondatabase/neon/pull/6050#discussion_r1426560695)~
- [x] Give downloads their own concurrency config independent of uploads

Deferred optimizations:
- https://github.com/neondatabase/neon/issues/6199
- https://github.com/neondatabase/neon/issues/6200

Eviction will be the next PR:
- #5342
---
 control_plane/src/pageserver.rs               |   7 +
 control_plane/src/tenant_migration.rs         |  21 +-
 libs/utils/src/lib.rs                         |   2 +
 libs/utils/src/sync/gate.rs                   |   6 +
 libs/utils/src/yielding_loop.rs               |  35 +
 pageserver/client/src/mgmt_api.rs             |  14 +-
 pageserver/src/config.rs                      |  30 +-
 pageserver/src/http/routes.rs                 |  35 +
 pageserver/src/lib.rs                         |   4 +
 pageserver/src/metrics.rs                     |  12 +
 pageserver/src/task_mgr.rs                    |   3 +
 pageserver/src/tenant/delete.rs               |   2 +-
 pageserver/src/tenant/mgr.rs                  | 253 ++++--
 .../src/tenant/remote_timeline_client.rs      |   9 +-
 pageserver/src/tenant/secondary.rs            | 142 +++-
 pageserver/src/tenant/secondary/downloader.rs | 801 ++++++++++++++++++
 .../src/tenant/secondary/heatmap_uploader.rs  | 502 ++++-------
 pageserver/src/tenant/secondary/scheduler.rs  | 361 ++++++++
 pageserver/src/tenant/tasks.rs                |   2 +
 test_runner/fixtures/pageserver/http.py       |   4 +
 .../regress/test_pageserver_secondary.py      | 149 +++-
 21 files changed, 1975 insertions(+), 419 deletions(-)
 create mode 100644 libs/utils/src/yielding_loop.rs
 create mode 100644 pageserver/src/tenant/secondary/downloader.rs
 create mode 100644 pageserver/src/tenant/secondary/scheduler.rs

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 7d490016bf..fb0d251722 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -485,6 +485,13 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_id).await?)
     }
 
+    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .tenant_secondary_download(*tenant_id)
+            .await?)
+    }
+
     pub async fn timeline_create(
         &self,
         tenant_id: TenantId,
diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs
index 79df108896..23ea8f4060 100644
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,6 +11,7 @@ use crate::{
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -40,9 +41,9 @@ async fn await_lsn(
     loop {
         let latest = match get_lsns(tenant_id, pageserver).await {
             Ok(l) => l,
-            Err(e) => {
+            Err(_e) => {
                 println!(
-                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    "🕑 Waiting for pageserver {} to activate...",
                     pageserver.conf.id
                 );
                 std::thread::sleep(Duration::from_millis(500));
@@ -89,7 +90,7 @@ pub async fn migrate_tenant(
     tenant_id: TenantId,
     dest_ps: PageServerNode,
 ) -> anyhow::Result<()> {
-    // Get a new generation
+    println!("🤔 Checking existing status...");
     let attachment_service = AttachmentService::from_env(env);
 
     fn build_location_config(
@@ -135,6 +136,20 @@ pub async fn migrate_tenant(
         baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
     }
 
+    println!(
+        "🔁 Downloading latest layers to destination pageserver {}",
+        dest_ps.conf.id
+    );
+    match dest_ps
+        .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
+        .await
+    {
+        Ok(()) => {}
+        Err(_) => {
+            println!("  (skipping, destination wasn't in secondary mode)")
+        }
+    }
+
     let gen = attachment_service
         .attach_hook(tenant_id, dest_ps.conf.id)
         .await?;
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 9e9b0adfe5..890061dc59 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -85,6 +85,8 @@ pub mod sync;
 
 pub mod failpoint_support;
 
+pub mod yielding_loop;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs
index 31c76d2f74..abc3842da8 100644
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,6 +15,12 @@ pub struct Gate {
     name: String,
 }
 
+impl std::fmt::Debug for Gate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Gate<{}>", self.name)
+    }
+}
+
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs
new file mode 100644
index 0000000000..963279eb4c
--- /dev/null
+++ b/libs/utils/src/yielding_loop.rs
@@ -0,0 +1,35 @@
+use tokio_util::sync::CancellationToken;
+
+#[derive(thiserror::Error, Debug)]
+pub enum YieldingLoopError {
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
+#[inline(always)]
+pub async fn yielding_loop<I, T, F>(
+    interval: usize,
+    cancel: &CancellationToken,
+    iter: I,
+    mut visitor: F,
+) -> Result<(), YieldingLoopError>
+where
+    I: Iterator<Item = T>,
+    F: FnMut(T),
+{
+    for (i, item) in iter.enumerate() {
+        visitor(item);
+
+        if i + 1 % interval == 0 {
+            tokio::task::yield_now().await;
+            if cancel.is_cancelled() {
+                return Err(YieldingLoopError::Cancelled);
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 87e4ed8efd..4c285293f7 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use pageserver_api::models::*;
+use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method};
 use utils::{
     http::error::HttpErrorBody,
@@ -164,6 +164,18 @@ impl Client {
         Ok(())
     }
 
+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/secondary/download",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, ())
+            .await?
+            .error_for_status()
+            .map(|_| ())
+            .map_err(|e| Error::ApiError(format!("{}", e)))
+    }
+
     pub async fn location_config(
         &self,
         tenant_id: TenantId,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 4560f5eca0..7c03dc1bdd 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };
 
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,6 +75,7 @@ pub mod defaults {
     pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
 
     pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 
@@ -130,6 +131,7 @@ pub mod defaults {
 #gc_feedback = false
 
 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
 
 [remote_storage]
 
@@ -239,6 +241,10 @@ pub struct PageServerConf {
     /// heatmap uploads vs. other remote storage operations.
     pub heatmap_upload_concurrency: usize,
 
+    /// How many remote storage downloads may be done for secondary tenants concurrently.  Implicitly
+    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
+    pub secondary_download_concurrency: usize,
+
     /// Maximum number of WAL records to be ingested and committed at the same time
     pub ingest_batch_size: u64,
 }
@@ -322,6 +328,7 @@ struct PageServerConfigBuilder {
     control_plane_emergency_mode: BuilderValue<bool>,
 
     heatmap_upload_concurrency: BuilderValue<usize>,
+    secondary_download_concurrency: BuilderValue<usize>,
 
     ingest_batch_size: BuilderValue<u64>,
 }
@@ -396,6 +403,7 @@ impl Default for PageServerConfigBuilder {
             control_plane_emergency_mode: Set(false),
 
             heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
 
             ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
         }
@@ -546,6 +554,10 @@ impl PageServerConfigBuilder {
         self.heatmap_upload_concurrency = BuilderValue::Set(value)
     }
 
+    pub fn secondary_download_concurrency(&mut self, value: usize) {
+        self.secondary_download_concurrency = BuilderValue::Set(value)
+    }
+
     pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
         self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
     }
@@ -651,6 +663,9 @@ impl PageServerConfigBuilder {
             heatmap_upload_concurrency: self
                 .heatmap_upload_concurrency
                 .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
+            secondary_download_concurrency: self
+                .secondary_download_concurrency
+                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
             ingest_batch_size: self
                 .ingest_batch_size
                 .ok_or(anyhow!("missing ingest_batch_size"))?,
@@ -711,6 +726,11 @@ impl PageServerConf {
             .join(TENANT_LOCATION_CONFIG_NAME)
     }
 
+    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(TENANT_HEATMAP_BASENAME)
+    }
+
     pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
         self.tenant_path(tenant_shard_id)
             .join(TIMELINES_SEGMENT_NAME)
@@ -896,6 +916,9 @@ impl PageServerConf {
                 "heatmap_upload_concurrency" => {
                     builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                 },
+                "secondary_download_concurrency" => {
+                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
+                },
                 "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
@@ -968,6 +991,7 @@ impl PageServerConf {
             control_plane_api_token: None,
             control_plane_emergency_mode: false,
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
         }
     }
@@ -1198,6 +1222,7 @@ background_task_maximum_delay = '334 s'
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             },
             "Correct defaults should be used when no config values are provided"
@@ -1260,6 +1285,7 @@ background_task_maximum_delay = '334 s'
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: 100,
             },
             "Should be able to parse all basic config values correctly"
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 8265627cb5..5c7747d353 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1274,6 +1274,23 @@ async fn put_tenant_location_config_handler(
         // which is not a 400 but a 409.
         .map_err(ApiError::BadRequest)?;
 
+    if let Some(_flush_ms) = flush {
+        match state
+            .secondary_controller
+            .upload_tenant(tenant_shard_id)
+            .await
+        {
+            Ok(()) => {
+                tracing::info!("Uploaded heatmap during flush");
+            }
+            Err(e) => {
+                tracing::warn!("Failed to flush heatmap: {e}");
+            }
+        }
+    } else {
+        tracing::info!("No flush requested when configuring");
+    }
+
     json_response(StatusCode::OK, ())
 }
 
@@ -1611,6 +1628,21 @@ async fn secondary_upload_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn secondary_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    state
+        .secondary_controller
+        .download_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -1879,6 +1911,9 @@ pub fn make_router(
         .put("/v1/deletion_queue/flush", |r| {
             api_handler(r, deletion_queue_flush)
         })
+        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
+            api_handler(r, secondary_download_handler)
+        })
         .put("/v1/tenant/:tenant_shard_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index c1ce0af47b..26070e0cc1 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
 
+/// Per-tenant copy of their remote heatmap, downloaded into the local
+/// tenant path while in secondary mode.
+pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 4725903783..c86adcfa3d 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1369,6 +1369,8 @@ pub(crate) struct SecondaryModeMetrics {
     pub(crate) upload_heatmap: IntCounter,
     pub(crate) upload_heatmap_errors: IntCounter,
     pub(crate) upload_heatmap_duration: Histogram,
+    pub(crate) download_heatmap: IntCounter,
+    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
     upload_heatmap: register_int_counter!(
@@ -1386,6 +1388,16 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
         "Time to build and upload a heatmap, including any waiting inside the S3 client"
     )
     .expect("failed to define a metric"),
+    download_heatmap: register_int_counter!(
+        "pageserver_secondary_download_heatmap",
+        "Number of downloads of heatmaps by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
+    download_layer: register_int_counter!(
+        "pageserver_secondary_download_layer",
+        "Number of downloads of layers by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
 });
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index eabb1d0022..5a06a97525 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -258,6 +258,9 @@ pub enum TaskKind {
     /// See [`crate::disk_usage_eviction_task`].
     DiskUsageEviction,
 
+    /// See [`crate::tenant::secondary`].
+    SecondaryDownloads,
+
     /// See [`crate::tenant::secondary`].
     SecondaryUploads,
 
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index b21bad51ba..2f606ed822 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -588,7 +588,7 @@ impl DeleteTenantFlow {
                             }
                             break;
                         }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
                             // This is unexpected: this secondary tenants should not have been created, and we
                             // are not in a position to shut it down from here.
                             tracing::warn!("Tenant transitioned to secondary mode while deleting!");
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 250de7247d..70b41b7b1f 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,6 +44,7 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 
 use super::delete::DeleteTenantError;
+use super::secondary::SecondaryTenant;
 use super::TenantSharedResources;
 
 /// For a tenant that appears in TenantsMap, it may either be
@@ -57,7 +58,7 @@ use super::TenantSharedResources;
 /// having a properly acquired generation (Secondary doesn't need a generation)
 pub(crate) enum TenantSlot {
     Attached(Arc<Tenant>),
-    Secondary,
+    Secondary(Arc<SecondaryTenant>),
     /// In this state, other administrative operations acting on the TenantId should
     /// block, or return a retry indicator equivalent to HTTP 503.
     InProgress(utils::completion::Barrier),
@@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
-            Self::Secondary => write!(f, "Secondary"),
+            Self::Secondary(_) => write!(f, "Secondary"),
             Self::InProgress(_) => write!(f, "InProgress"),
         }
     }
@@ -78,7 +79,7 @@ impl TenantSlot {
     fn get_attached(&self) -> Option<&Arc<Tenant>> {
         match self {
             Self::Attached(t) => Some(t),
-            Self::Secondary => None,
+            Self::Secondary(_) => None,
             Self::InProgress(_) => None,
         }
     }
@@ -466,12 +467,18 @@ pub async fn init_tenant_mgr(
                 *gen
             } else {
                 match &location_conf.mode {
-                    LocationMode::Secondary(_) => {
+                    LocationMode::Secondary(secondary_config) => {
                         // We do not require the control plane's permission for secondary mode
                         // tenants, because they do no remote writes and hence require no
                         // generation number
                         info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                secondary_config,
+                            )),
+                        );
                     }
                     LocationMode::Attached(_) => {
                         // TODO: augment re-attach API to enable the control plane to
@@ -663,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
 
                             total_attached += 1;
                         }
-                        TenantSlot::Secondary => {
-                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
+                        TenantSlot::Secondary(state) => {
+                            // We don't need to wait for this individually per-tenant: the
+                            // downloader task will be waited on eventually, this cancel
+                            // is just to encourage it to drop out if it is doing work
+                            // for this tenant right now.
+                            state.cancel.cancel();
+
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
                         }
                         TenantSlot::InProgress(notify) => {
                             // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -847,12 +860,28 @@ impl TenantManager {
             Some(TenantSlot::InProgress(_)) => {
                 Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
             }
-            None | Some(TenantSlot::Secondary) => {
+            None | Some(TenantSlot::Secondary(_)) => {
                 Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
             }
         }
     }
 
+    pub(crate) fn get_secondary_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Option<Arc<SecondaryTenant>> {
+        let locked = self.tenants.read().unwrap();
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+            .ok()
+            .flatten();
+
+        match peek_slot {
+            Some(TenantSlot::Secondary(s)) => Some(s.clone()),
+            _ => None,
+        }
+    }
+
     #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
     pub(crate) async fn upsert_location(
         &self,
@@ -864,10 +893,15 @@ impl TenantManager {
         debug_assert_current_span_has_tenant_id();
         info!("configuring tenant location to state {new_location_config:?}");
 
-        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
+        enum FastPathModified {
+            Attached(Arc<Tenant>),
+            Secondary(Arc<SecondaryTenant>),
+        }
+
+        // Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
         // then we do not need to set the slot to InProgress, we can just call into the
         // existng tenant.
-        let modify_tenant = {
+        let fast_path_taken = {
             let locked = self.tenants.read().unwrap();
             let peek_slot =
                 tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -881,12 +915,19 @@ impl TenantManager {
                             new_location_config.clone(),
                         )?);
 
-                        Some(tenant.clone())
+                        Some(FastPathModified::Attached(tenant.clone()))
                     } else {
                         // Different generations, fall through to general case
                         None
                     }
                 }
+                (
+                    LocationMode::Secondary(secondary_conf),
+                    Some(TenantSlot::Secondary(secondary_tenant)),
+                ) => {
+                    secondary_tenant.set_config(secondary_conf);
+                    Some(FastPathModified::Secondary(secondary_tenant.clone()))
+                }
                 _ => {
                     // Not an Attached->Attached transition, fall through to general case
                     None
@@ -895,34 +936,51 @@ impl TenantManager {
         };
 
         // Fast-path continued: having dropped out of the self.tenants lock, do the async
-        // phase of waiting for flush, before returning.
-        if let Some(tenant) = modify_tenant {
-            // Transition to AttachedStale means we may well hold a valid generation
-            // still, and have been requested to go stale as part of a migration.  If
-            // the caller set `flush`, then flush to remote storage.
-            if let LocationMode::Attached(AttachedLocationConfig {
-                generation: _,
-                attach_mode: AttachmentMode::Stale,
-            }) = &new_location_config.mode
-            {
-                if let Some(flush_timeout) = flush {
-                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
-                        Ok(Err(e)) => {
-                            return Err(e);
-                        }
-                        Ok(Ok(_)) => return Ok(()),
-                        Err(_) => {
-                            tracing::warn!(
+        // phase of writing config and/or waiting for flush, before returning.
+        match fast_path_taken {
+            Some(FastPathModified::Attached(tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                // Transition to AttachedStale means we may well hold a valid generation
+                // still, and have been requested to go stale as part of a migration.  If
+                // the caller set `flush`, then flush to remote storage.
+                if let LocationMode::Attached(AttachedLocationConfig {
+                    generation: _,
+                    attach_mode: AttachmentMode::Stale,
+                }) = &new_location_config.mode
+                {
+                    if let Some(flush_timeout) = flush {
+                        match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                            Ok(Err(e)) => {
+                                return Err(e);
+                            }
+                            Ok(Ok(_)) => return Ok(()),
+                            Err(_) => {
+                                tracing::warn!(
                                 timeout_ms = flush_timeout.as_millis(),
                                 "Timed out waiting for flush to remote storage, proceeding anyway."
                             )
+                            }
                         }
                     }
                 }
-            }
 
-            return Ok(());
-        }
+                return Ok(());
+            }
+            Some(FastPathModified::Secondary(_secondary_tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                return Ok(());
+            }
+            None => {
+                // Proceed with the general case procedure, where we will shutdown & remove any existing
+                // slot contents and replace with a fresh one
+            }
+        };
 
         // General case for upserts to TenantsMap, excluding the case above: we will substitute an
         // InProgress value to the slot while we make whatever changes are required.  The state for
@@ -931,33 +989,47 @@ impl TenantManager {
         // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
         let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
 
-        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-            // The case where we keep a Tenant alive was covered above in the special case
-            // for Attached->Attached transitions in the same generation.  By this point,
-            // if we see an attached tenant we know it will be discarded and should be
-            // shut down.
-            let (_guard, progress) = utils::completion::channel();
+        match slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(tenant)) => {
+                // The case where we keep a Tenant alive was covered above in the special case
+                // for Attached->Attached transitions in the same generation.  By this point,
+                // if we see an attached tenant we know it will be discarded and should be
+                // shut down.
+                let (_guard, progress) = utils::completion::channel();
 
-            match tenant.get_attach_mode() {
-                AttachmentMode::Single | AttachmentMode::Multi => {
-                    // Before we leave our state as the presumed holder of the latest generation,
-                    // flush any outstanding deletions to reduce the risk of leaking objects.
-                    self.resources.deletion_queue_client.flush_advisory()
-                }
-                AttachmentMode::Stale => {
-                    // If we're stale there's not point trying to flush deletions
-                }
-            };
+                match tenant.get_attach_mode() {
+                    AttachmentMode::Single | AttachmentMode::Multi => {
+                        // Before we leave our state as the presumed holder of the latest generation,
+                        // flush any outstanding deletions to reduce the risk of leaking objects.
+                        self.resources.deletion_queue_client.flush_advisory()
+                    }
+                    AttachmentMode::Stale => {
+                        // If we're stale there's not point trying to flush deletions
+                    }
+                };
 
-            info!("Shutting down attached tenant");
-            match tenant.shutdown(progress, false).await {
-                Ok(()) => {}
-                Err(barrier) => {
-                    info!("Shutdown already in progress, waiting for it to complete");
-                    barrier.wait().await;
+                info!("Shutting down attached tenant");
+                match tenant.shutdown(progress, false).await {
+                    Ok(()) => {}
+                    Err(barrier) => {
+                        info!("Shutdown already in progress, waiting for it to complete");
+                        barrier.wait().await;
+                    }
                 }
+                slot_guard.drop_old_value().expect("We just shut it down");
+            }
+            Some(TenantSlot::Secondary(state)) => {
+                info!("Shutting down secondary tenant");
+                state.shutdown().await;
+            }
+            Some(TenantSlot::InProgress(_)) => {
+                // This should never happen: acquire_slot should error out
+                // if the contents of a slot were InProgress.
+                anyhow::bail!("Acquired an InProgress slot, this is a bug.")
+            }
+            None => {
+                // Slot was vacant, nothing needs shutting down.
             }
-            slot_guard.drop_old_value().expect("We just shut it down");
         }
 
         let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -980,7 +1052,9 @@ impl TenantManager {
             .map_err(SetNewTenantConfigError::Persist)?;
 
         let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => TenantSlot::Secondary,
+            LocationMode::Secondary(secondary_config) => {
+                TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
+            }
             LocationMode::Attached(_attach_config) => {
                 let shard_identity = new_location_config.shard;
                 let tenant = tenant_spawn(
@@ -1093,6 +1167,30 @@ impl TenantManager {
                 .collect(),
         }
     }
+    // Do some synchronous work for all tenant slots in Secondary state.  The provided
+    // callback should be small and fast, as it will be called inside the global
+    // TenantsMap lock.
+    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
+    where
+        // TODO: let the callback return a hint to drop out of the loop early
+        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
+    {
+        let locked = self.tenants.read().unwrap();
+
+        let map = match &*locked {
+            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
+            TenantsMap::Open(m) => m,
+        };
+
+        for (tenant_id, slot) in map {
+            if let TenantSlot::Secondary(state) = slot {
+                // Only expose secondary tenants that are not currently shutting down
+                if !state.cancel.is_cancelled() {
+                    func(tenant_id, state)
+                }
+            }
+        }
+    }
 
     pub(crate) async fn delete_tenant(
         &self,
@@ -1207,7 +1305,7 @@ pub(crate) fn get_tenant(
         Some(TenantSlot::InProgress(_)) => {
             Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
         }
-        None | Some(TenantSlot::Secondary) => {
+        None | Some(TenantSlot::Secondary(_)) => {
             Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
         }
     }
@@ -1280,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                     }
                 }
             }
-            Some(TenantSlot::Secondary) => {
+            Some(TenantSlot::Secondary(_)) => {
                 return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
                     tenant_id,
                 )))
@@ -1544,7 +1642,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
     Ok(m.iter()
         .filter_map(|(id, tenant)| match tenant {
             TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary => None,
+            TenantSlot::Secondary(_) => None,
             TenantSlot::InProgress(_) => None,
         })
         .collect())
@@ -1801,11 +1899,7 @@ impl SlotGuard {
     fn old_value_is_shutdown(&self) -> bool {
         match self.old_value.as_ref() {
             Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary) => {
-                // TODO: when adding secondary mode tenants, this will check for shutdown
-                // in the same way that we do for `Tenant` above
-                true
-            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
             Some(TenantSlot::InProgress(_)) => {
                 // A SlotGuard cannot be constructed for a slot that was already InProgress
                 unreachable!()
@@ -2015,26 +2109,19 @@ where
     let mut slot_guard =
         tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
 
-    // The SlotGuard allows us to manipulate the Tenant object without fear of some
-    // concurrent API request doing something else for the same tenant ID.
-    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(t)) => Some(t),
-        _ => None,
-    };
-
     // allow pageserver shutdown to await for our completion
     let (_guard, progress) = completion::channel();
 
-    // If the tenant was attached, shut it down gracefully.  For secondary
-    // locations this part is not necessary
-    match &attached_tenant {
-        Some(attached_tenant) => {
+    // The SlotGuard allows us to manipulate the Tenant object without fear of some
+    // concurrent API request doing something else for the same tenant ID.
+    let attached_tenant = match slot_guard.get_old_value() {
+        Some(TenantSlot::Attached(tenant)) => {
             // whenever we remove a tenant from memory, we don't want to flush and wait for upload
             let freeze_and_flush = false;
 
             // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
             // that we can continue safely to cleanup.
-            match attached_tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, freeze_and_flush).await {
                 Ok(()) => {}
                 Err(_other) => {
                     // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2043,11 +2130,19 @@ where
                     return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                 }
             }
+            Some(tenant)
         }
-        None => {
-            // Nothing to wait on when not attached, proceed.
+        Some(TenantSlot::Secondary(secondary_state)) => {
+            tracing::info!("Shutting down in secondary mode");
+            secondary_state.shutdown().await;
+            None
         }
-    }
+        Some(TenantSlot::InProgress(_)) => {
+            // Acquiring a slot guarantees its old value was not InProgress
+            unreachable!();
+        }
+        None => None,
+    };
 
     match tenant_cleanup
         .await
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 60b40d70a7..2ea3ced008 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -229,6 +229,7 @@ use crate::{
     tenant::upload_queue::{
         UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
     },
+    TENANT_HEATMAP_BASENAME,
 };
 
 use utils::id::{TenantId, TimelineId};
@@ -1741,11 +1742,11 @@ pub fn remote_index_path(
     .expect("Failed to construct path")
 }
 
-pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
-        .expect("Failed to construct path")
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
+    ))
+    .expect("Failed to construct path")
 }
 
 /// Given the key of an index, parse out the generation part of the name
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index d25fe56b92..2331447266 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,24 +1,48 @@
+mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
+mod scheduler;
 
 use std::sync::Arc;
 
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 
-use self::heatmap_uploader::heatmap_uploader_task;
+use self::{
+    downloader::{downloader_task, SecondaryDetail},
+    heatmap_uploader::heatmap_uploader_task,
+};
 
-use super::mgr::TenantManager;
+use super::{config::SecondaryLocationConfig, mgr::TenantManager};
 
 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 
 use tokio_util::sync::CancellationToken;
-use utils::completion::Barrier;
+use utils::{completion::Barrier, sync::gate::Gate};
 
+enum DownloadCommand {
+    Download(TenantShardId),
+}
 enum UploadCommand {
     Upload(TenantShardId),
 }
 
+impl UploadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Upload(id) => id,
+        }
+    }
+}
+
+impl DownloadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Download(id) => id,
+        }
+    }
+}
+
 struct CommandRequest<T> {
     payload: T,
     response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -28,12 +52,73 @@ struct CommandResponse {
     result: anyhow::Result<()>,
 }
 
+// Whereas [`Tenant`] represents an attached tenant, this type represents the work
+// we do for secondary tenant locations: where we are not serving clients or
+// ingesting WAL, but we are maintaining a warm cache of layer files.
+//
+// This type is all about the _download_ path for secondary mode.  The upload path
+// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
+//
+// This structure coordinates TenantManager and SecondaryDownloader,
+// so that the downloader can indicate which tenants it is currently
+// operating on, and the manager can indicate when a particular
+// secondary tenant should cancel any work in flight.
+#[derive(Debug)]
+pub(crate) struct SecondaryTenant {
+    /// Carrying a tenant shard ID simplifies callers such as the downloader
+    /// which need to organize many of these objects by ID.
+    tenant_shard_id: TenantShardId,
+
+    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
+    /// any work for this tenant at the next opportunity.
+    pub(crate) cancel: CancellationToken,
+
+    pub(crate) gate: Gate,
+
+    detail: std::sync::Mutex<SecondaryDetail>,
+}
+
+impl SecondaryTenant {
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        config: &SecondaryLocationConfig,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_shard_id,
+            // todo: shall we make this a descendent of the
+            // main cancellation token, or is it sufficient that
+            // on shutdown we walk the tenants and fire their
+            // individual cancellations?
+            cancel: CancellationToken::new(),
+            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
+
+            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
+        })
+    }
+
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        // Wait for any secondary downloader work to complete
+        self.gate.close().await;
+    }
+
+    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
+        self.detail.lock().unwrap().config = config.clone();
+    }
+
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant.  In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
     upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }
 
 impl SecondaryController {
@@ -63,6 +148,13 @@ impl SecondaryController {
         self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
             .await
     }
+    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(
+            &self.download_req_tx,
+            DownloadCommand::Download(tenant_shard_id),
+        )
+        .await
+    }
 }
 
 pub fn spawn_tasks(
@@ -71,9 +163,37 @@ pub fn spawn_tasks(
     background_jobs_can_start: Barrier,
     cancel: CancellationToken,
 ) -> SecondaryController {
+    let mgr_clone = tenant_manager.clone();
+    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
+    let bg_jobs_clone = background_jobs_can_start.clone();
+
+    let (download_req_tx, download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
     let (upload_req_tx, upload_req_rx) =
         tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
 
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "secondary tenant downloads",
+        false,
+        async move {
+            downloader_task(
+                mgr_clone,
+                storage_clone,
+                download_req_rx,
+                bg_jobs_clone,
+                cancel_clone,
+            )
+            .await;
+
+            Ok(())
+        },
+    );
+
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::SecondaryUploads,
@@ -89,16 +209,26 @@ pub fn spawn_tasks(
                 background_jobs_can_start,
                 cancel,
             )
-            .await
+            .await;
+
+            Ok(())
         },
     );
 
-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
 }
 
 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
     let (upload_req_tx, _upload_req_rx) =
         tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
 }
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
new file mode 100644
index 0000000000..6fdee08a4e
--- /dev/null
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -0,0 +1,801 @@
+use std::{
+    collections::{HashMap, HashSet},
+    pin::Pin,
+    str::FromStr,
+    sync::Arc,
+    time::{Duration, Instant, SystemTime},
+};
+
+use crate::{
+    config::PageServerConf,
+    metrics::SECONDARY_MODE,
+    tenant::{
+        config::SecondaryLocationConfig,
+        debug_assert_current_span_has_tenant_and_timeline_id,
+        remote_timeline_client::{
+            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+        },
+        span::debug_assert_current_span_has_tenant_id,
+        storage_layer::LayerFileName,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+    },
+    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
+    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
+};
+
+use super::{
+    heatmap::HeatMapLayer,
+    scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
+    SecondaryTenant,
+};
+
+use crate::tenant::{
+    mgr::TenantManager,
+    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
+};
+
+use chrono::format::{DelayedFormat, StrftimeItems};
+use futures::Future;
+use pageserver_api::shard::TenantShardId;
+use rand::Rng;
+use remote_storage::{DownloadError, GenericRemoteStorage};
+
+use tokio_util::sync::CancellationToken;
+use tracing::{info_span, instrument, Instrument};
+use utils::{
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
+};
+
+use super::{
+    heatmap::{HeatMapTenant, HeatMapTimeline},
+    CommandRequest, DownloadCommand,
+};
+
+/// For each tenant, how long must have passed since the last download_tenant call before
+/// calling it again.  This is approximately the time by which local data is allowed
+/// to fall behind remote data.
+///
+/// TODO: this should just be a default, and the actual period should be controlled
+/// via the heatmap itself
+/// `<ttps://github.com/neondatabase/neon/issues/6200>`
+const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+
+pub(super) async fn downloader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+
+    let generator = SecondaryDownloader {
+        tenant_manager,
+        remote_storage,
+    };
+    let mut scheduler = Scheduler::new(generator, concurrency);
+
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("secondary_downloads"))
+        .await
+}
+
+struct SecondaryDownloader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+}
+
+#[derive(Debug, Clone)]
+pub(super) struct OnDiskState {
+    metadata: LayerFileMetadata,
+    access_time: SystemTime,
+}
+
+impl OnDiskState {
+    fn new(
+        _conf: &'static PageServerConf,
+        _tenant_shard_id: &TenantShardId,
+        _imeline_id: &TimelineId,
+        _ame: LayerFileName,
+        metadata: LayerFileMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            metadata,
+            access_time,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub(super) struct SecondaryDetailTimeline {
+    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+
+    /// We remember when layers were evicted, to prevent re-downloading them.
+    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+}
+
+/// This state is written by the secondary downloader, it is opaque
+/// to TenantManager
+#[derive(Debug)]
+pub(super) struct SecondaryDetail {
+    pub(super) config: SecondaryLocationConfig,
+
+    last_download: Option<Instant>,
+    next_download: Option<Instant>,
+    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+}
+
+/// Helper for logging SystemTime
+fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
+    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
+    datetime.format("%d/%m/%Y %T")
+}
+
+impl SecondaryDetail {
+    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
+        Self {
+            config,
+            last_download: None,
+            next_download: None,
+            timelines: HashMap::new(),
+        }
+    }
+}
+
+struct PendingDownload {
+    secondary_state: Arc<SecondaryTenant>,
+    last_download: Option<Instant>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl scheduler::PendingJob for PendingDownload {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+struct RunningDownload {
+    barrier: Barrier,
+}
+
+impl scheduler::RunningJob for RunningDownload {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
+struct CompleteDownload {
+    secondary_state: Arc<SecondaryTenant>,
+    completed_at: Instant,
+}
+
+impl scheduler::Completion for CompleteDownload {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+type Scheduler = TenantBackgroundJobs<
+    SecondaryDownloader,
+    PendingDownload,
+    RunningDownload,
+    CompleteDownload,
+    DownloadCommand,
+>;
+
+#[async_trait::async_trait]
+impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
+    for SecondaryDownloader
+{
+    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
+    fn on_completion(&mut self, completion: CompleteDownload) {
+        let CompleteDownload {
+            secondary_state,
+            completed_at: _completed_at,
+        } = completion;
+
+        tracing::debug!("Secondary tenant download completed");
+
+        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
+        // take priority to run again.
+        let mut detail = secondary_state.detail.lock().unwrap();
+        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
+    }
+
+    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };
+
+        // Step 1: identify some tenants that we may work on
+        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
+        self.tenant_manager
+            .foreach_secondary_tenants(|_id, secondary_state| {
+                tenants.push(secondary_state.clone());
+            });
+
+        // Step 2: filter out tenants which are not yet elegible to run
+        let now = Instant::now();
+        result.jobs = tenants
+            .into_iter()
+            .filter_map(|secondary_tenant| {
+                let (last_download, next_download) = {
+                    let mut detail = secondary_tenant.detail.lock().unwrap();
+
+                    if !detail.config.warm {
+                        // Downloads are disabled for this tenant
+                        detail.next_download = None;
+                        return None;
+                    }
+
+                    if detail.next_download.is_none() {
+                        // Initialize with a jitter: this spreads initial downloads on startup
+                        // or mass-attach across our freshen interval.
+                        let jittered_period =
+                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
+                        detail.next_download = Some(now.checked_add(jittered_period).expect(
+                        "Using our constant, which is known to be small compared with clock range",
+                    ));
+                    }
+                    (detail.last_download, detail.next_download.unwrap())
+                };
+
+                if now < next_download {
+                    Some(PendingDownload {
+                        secondary_state: secondary_tenant,
+                        last_download,
+                        target_time: Some(next_download),
+                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
+                    })
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        // Step 3: sort by target execution time to run most urgent first.
+        result.jobs.sort_by_key(|j| j.target_time);
+
+        result
+    }
+
+    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        let tenant = self
+            .tenant_manager
+            .get_secondary_tenant_shard(*tenant_shard_id);
+        let Some(tenant) = tenant else {
+            {
+                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
+            }
+        };
+
+        Ok(PendingDownload {
+            target_time: None,
+            period: None,
+            last_download: None,
+            secondary_state: tenant,
+        })
+    }
+
+    fn spawn(
+        &mut self,
+        job: PendingDownload,
+    ) -> (
+        RunningDownload,
+        Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
+    ) {
+        let PendingDownload {
+            secondary_state,
+            last_download,
+            target_time,
+            period,
+        } = job;
+
+        let (completion, barrier) = utils::completion::channel();
+        let remote_storage = self.remote_storage.clone();
+        let conf = self.tenant_manager.get_conf();
+        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
+        (RunningDownload { barrier }, Box::pin(async move {
+            let _completion = completion;
+
+            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
+                .download()
+                .await
+            {
+                Err(UpdateError::NoData) => {
+                    tracing::info!("No heatmap found for tenant.  This is fine if it is new.");
+                },
+                Err(UpdateError::NoSpace) => {
+                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
+                }
+                Err(UpdateError::Cancelled) => {
+                    tracing::debug!("Shut down while downloading");
+                },
+                Err(UpdateError::Deserialize(e)) => {
+                    tracing::error!("Corrupt content while downloading tenant: {e}");
+                },
+                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
+                    tracing::error!("Error while downloading tenant: {e}");
+                },
+                Ok(()) => {}
+            };
+
+            // Irrespective of the result, we will reschedule ourselves to run after our usual period.
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Only track execution lag if this isn't our first download: otherwise, it is expected
+                // that execution will have taken longer than our configured interval, for example
+                // when starting up a pageserver and
+                if last_download.is_some() {
+                    // Elapsed time includes any scheduling lag as well as the execution of the job
+                    let elapsed = Instant::now().duration_since(target_time);
+
+                    warn_when_period_overrun(
+                        elapsed,
+                        period,
+                        BackgroundLoopKind::SecondaryDownload,
+                    );
+                }
+            }
+
+            CompleteDownload {
+                    secondary_state,
+                    completed_at: Instant::now(),
+                }
+        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
+    }
+}
+
+/// This type is a convenience to group together the various functions involved in
+/// freshening a secondary tenant.
+struct TenantDownloader<'a> {
+    conf: &'static PageServerConf,
+    remote_storage: &'a GenericRemoteStorage,
+    secondary_state: &'a SecondaryTenant,
+}
+
+/// Errors that may be encountered while updating a tenant
+#[derive(thiserror::Error, Debug)]
+enum UpdateError {
+    #[error("No remote data found")]
+    NoData,
+    #[error("Insufficient local storage space")]
+    NoSpace,
+    #[error("Failed to download")]
+    DownloadError(DownloadError),
+    #[error(transparent)]
+    Deserialize(#[from] serde_json::Error),
+    #[error("Cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<DownloadError> for UpdateError {
+    fn from(value: DownloadError) -> Self {
+        match &value {
+            DownloadError::Cancelled => Self::Cancelled,
+            DownloadError::NotFound => Self::NoData,
+            _ => Self::DownloadError(value),
+        }
+    }
+}
+
+impl From<std::io::Error> for UpdateError {
+    fn from(value: std::io::Error) -> Self {
+        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
+            UpdateError::NoSpace
+        } else {
+            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
+            UpdateError::Other(anyhow::anyhow!(value))
+        }
+    }
+}
+
+impl<'a> TenantDownloader<'a> {
+    fn new(
+        conf: &'static PageServerConf,
+        remote_storage: &'a GenericRemoteStorage,
+        secondary_state: &'a SecondaryTenant,
+    ) -> Self {
+        Self {
+            conf,
+            remote_storage,
+            secondary_state,
+        }
+    }
+
+    async fn download(&self) -> Result<(), UpdateError> {
+        debug_assert_current_span_has_tenant_id();
+
+        // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
+        // cover our access to local storage.
+        let Ok(_guard) = self.secondary_state.gate.enter() else {
+            // Shutting down
+            return Ok(());
+        };
+
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // Download the tenant's heatmap
+        let heatmap_bytes = tokio::select!(
+            bytes = self.download_heatmap() => {bytes?},
+            _ = self.secondary_state.cancel.cancelled() => return Ok(())
+        );
+
+        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
+
+        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
+        // layer metadata without having to re-download it.
+        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
+
+        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
+        let heatmap_path_bg = heatmap_path.clone();
+        tokio::task::spawn_blocking(move || {
+            tokio::runtime::Handle::current().block_on(async move {
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
+            })
+        })
+        .await
+        .expect("Blocking task is never aborted")
+        .maybe_fatal_err(&context_msg)?;
+
+        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+
+        // Download the layers in the heatmap
+        for timeline in heatmap.timelines {
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            let timeline_id = timeline.timeline_id;
+            self.download_timeline(timeline)
+                .instrument(tracing::info_span!(
+                    "secondary_download_timeline",
+                    tenant_id=%tenant_shard_id.tenant_id,
+                    shard_id=%tenant_shard_id.shard_slug(),
+                    %timeline_id
+                ))
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
+        debug_assert_current_span_has_tenant_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // TODO: make download conditional on ETag having changed since last download
+        // (https://github.com/neondatabase/neon/issues/6199)
+        tracing::debug!("Downloading heatmap for secondary tenant",);
+
+        let heatmap_path = remote_heatmap_path(tenant_shard_id);
+
+        let heatmap_bytes = backoff::retry(
+            || async {
+                let download = self
+                    .remote_storage
+                    .download(&heatmap_path)
+                    .await
+                    .map_err(UpdateError::from)?;
+                let mut heatmap_bytes = Vec::new();
+                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
+                Ok(heatmap_bytes)
+            },
+            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "download heatmap",
+            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
+                UpdateError::Cancelled
+            }),
+        )
+        .await?;
+
+        SECONDARY_MODE.download_heatmap.inc();
+
+        Ok(heatmap_bytes)
+    }
+
+    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        let timeline_path = self
+            .conf
+            .timeline_path(tenant_shard_id, &timeline.timeline_id);
+
+        // Accumulate updates to the state
+        let mut touched = Vec::new();
+
+        // Clone a view of what layers already exist on disk
+        let timeline_state = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                let timeline_state =
+                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
+
+                // Re-acquire detail lock now that we're done with async load from local FS
+                self.secondary_state
+                    .detail
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(timeline.timeline_id, timeline_state.clone());
+                timeline_state
+            }
+        };
+
+        let layers_in_heatmap = timeline
+            .layers
+            .iter()
+            .map(|l| &l.name)
+            .collect::<HashSet<_>>();
+        let layers_on_disk = timeline_state
+            .on_disk_layers
+            .iter()
+            .map(|l| l.0)
+            .collect::<HashSet<_>>();
+
+        // Remove on-disk layers that are no longer present in heatmap
+        for layer in layers_on_disk.difference(&layers_in_heatmap) {
+            let local_path = timeline_path.join(layer.to_string());
+            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .maybe_fatal_err("Removing secondary layer")?;
+        }
+
+        // Download heatmap layers that are not present on local disk, or update their
+        // access time if they are already present.
+        for layer in timeline.layers {
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Existing on-disk layers: just update their access time.
+            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
+                tracing::debug!("Layer {} is already on disk", layer.name);
+                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
+                    // We already have this layer on disk.  Update its access time.
+                    tracing::debug!(
+                        "Access time updated for layer {}: {} -> {}",
+                        layer.name,
+                        strftime(&on_disk.access_time),
+                        strftime(&layer.access_time)
+                    );
+                    touched.push(layer);
+                }
+                continue;
+            } else {
+                tracing::debug!("Layer {} not present on disk yet", layer.name);
+            }
+
+            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
+            // recently than it was evicted.
+            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
+                if &layer.access_time > evicted_at {
+                    tracing::info!(
+                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                } else {
+                    tracing::trace!(
+                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                    continue;
+                }
+            }
+
+            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+            let downloaded_bytes = match download_layer_file(
+                self.conf,
+                self.remote_storage,
+                *tenant_shard_id,
+                timeline.timeline_id,
+                &layer.name,
+                &LayerFileMetadata::from(&layer.metadata),
+                &self.secondary_state.cancel,
+            )
+            .await
+            {
+                Ok(bytes) => bytes,
+                Err(e) => {
+                    if let DownloadError::NotFound = e {
+                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                        // This is harmless: continue to download the next layer. It is expected during compaction
+                        // GC.
+                        tracing::debug!(
+                            "Skipped downloading missing layer {}, raced with compaction/gc?",
+                            layer.name
+                        );
+                        continue;
+                    } else {
+                        return Err(e.into());
+                    }
+                }
+            };
+
+            if downloaded_bytes != layer.metadata.file_size {
+                let local_path = timeline_path.join(layer.name.to_string());
+
+                tracing::warn!(
+                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
+                    layer.name,
+                    downloaded_bytes,
+                    layer.metadata.file_size
+                );
+
+                tokio::fs::remove_file(&local_path)
+                    .await
+                    .or_else(fs_ext::ignore_not_found)?;
+            }
+
+            SECONDARY_MODE.download_layer.inc();
+            touched.push(layer)
+        }
+
+        // Write updates to state to record layers we just downloaded or touched.
+        {
+            let mut detail = self.secondary_state.detail.lock().unwrap();
+            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
+
+            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
+
+            for t in touched {
+                use std::collections::hash_map::Entry;
+                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().access_time = t.access_time;
+                    }
+                    Entry::Vacant(e) => {
+                        e.insert(OnDiskState::new(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            t.name,
+                            LayerFileMetadata::from(&t.metadata),
+                            t.access_time,
+                        ));
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
+async fn init_timeline_state(
+    conf: &'static PageServerConf,
+    tenant_shard_id: &TenantShardId,
+    heatmap: &HeatMapTimeline,
+) -> SecondaryDetailTimeline {
+    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
+    let mut detail = SecondaryDetailTimeline::default();
+
+    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
+        Ok(d) => d,
+        Err(e) => {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                let context = format!("Creating timeline directory {timeline_path}");
+                tracing::info!("{}", context);
+                tokio::fs::create_dir_all(&timeline_path)
+                    .await
+                    .fatal_err(&context);
+
+                // No entries to report: drop out.
+                return detail;
+            } else {
+                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
+            }
+        }
+    };
+
+    // As we iterate through layers found on disk, we will look up their metadata from this map.
+    // Layers not present in metadata will be discarded.
+    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
+        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+
+    while let Some(dentry) = dir
+        .next_entry()
+        .await
+        .fatal_err(&format!("Listing {timeline_path}"))
+    {
+        let dentry_file_name = dentry.file_name();
+        let file_name = dentry_file_name.to_string_lossy();
+        let local_meta = dentry.metadata().await.fatal_err(&format!(
+            "Read metadata on {}",
+            dentry.path().to_string_lossy()
+        ));
+
+        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+        if file_name == METADATA_FILE_NAME {
+            continue;
+        }
+
+        match LayerFileName::from_str(&file_name) {
+            Ok(name) => {
+                let remote_meta = heatmap_metadata.get(&name);
+                match remote_meta {
+                    Some(remote_meta) => {
+                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
+                        if local_meta.len() != remote_meta.metadata.file_size {
+                            // This should not happen, because we do crashsafe write-then-rename when downloading
+                            // layers, and layers in remote storage are immutable.  Remove the local file because
+                            // we cannot trust it.
+                            tracing::warn!(
+                                "Removing local layer {name} with unexpected local size {} != {}",
+                                local_meta.len(),
+                                remote_meta.metadata.file_size
+                            );
+                        } else {
+                            // We expect the access time to be initialized immediately afterwards, when
+                            // the latest heatmap is applied to the state.
+                            detail.on_disk_layers.insert(
+                                name.clone(),
+                                OnDiskState::new(
+                                    conf,
+                                    tenant_shard_id,
+                                    &heatmap.timeline_id,
+                                    name,
+                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.access_time,
+                                ),
+                            );
+                        }
+                    }
+                    None => {
+                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
+                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
+                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
+                        tracing::info!(
+                            "Removing secondary local layer {} because it's absent in heatmap",
+                            name
+                        );
+                        tokio::fs::remove_file(&dentry.path())
+                            .await
+                            .or_else(fs_ext::ignore_not_found)
+                            .fatal_err(&format!(
+                                "Removing layer {}",
+                                dentry.path().to_string_lossy()
+                            ));
+                    }
+                }
+            }
+            Err(_) => {
+                // Ignore it.
+                tracing::warn!("Unexpected file in timeline directory: {file_name}");
+            }
+        }
+    }
+
+    detail
+}
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index ece2b93ce1..ef01c33e8e 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,5 +1,6 @@
 use std::{
     collections::HashMap,
+    pin::Pin,
     sync::{Arc, Weak},
     time::{Duration, Instant},
 };
@@ -7,35 +8,86 @@ use std::{
 use crate::{
     metrics::SECONDARY_MODE,
     tenant::{
-        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
-        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+        config::AttachmentMode,
+        mgr::TenantManager,
+        remote_timeline_client::remote_heatmap_path,
+        span::debug_assert_current_span_has_tenant_id,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+        Tenant,
     },
 };
 
+use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
+use rand::Rng;
 use remote_storage::GenericRemoteStorage;
 
-use tokio::task::JoinSet;
+use super::{
+    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
+    CommandRequest,
+};
 use tokio_util::sync::CancellationToken;
-use tracing::instrument;
-use utils::{backoff, completion::Barrier};
+use tracing::{info_span, instrument, Instrument};
+use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
 
-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+use super::{heatmap::HeatMapTenant, UploadCommand};
 
-/// Period between heatmap uploader walking Tenants to look for work to do.
-/// If any tenants have a heatmap upload period lower than this, it will be adjusted
-/// downward to match.
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let generator = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tenants: HashMap::new(),
+    };
+    let mut scheduler = Scheduler::new(generator, concurrency);
+
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("heatmap_uploader"))
+        .await
+}
+
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+}
 
 struct WriteInProgress {
     barrier: Barrier,
 }
 
+impl RunningJob for WriteInProgress {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
 struct UploadPending {
     tenant: Arc<Tenant>,
     last_digest: Option<md5::Digest>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl scheduler::PendingJob for UploadPending {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.tenant.get_tenant_shard_id()
+    }
 }
 
 struct WriteComplete {
@@ -45,6 +97,12 @@ struct WriteComplete {
     next_upload: Option<Instant>,
 }
 
+impl scheduler::Completion for WriteComplete {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
 /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
@@ -68,267 +126,111 @@ struct UploaderTenantState {
     next_upload: Option<Instant>,
 }
 
-/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
-/// handling loop and mutates it as needed: there are no locks here, because that event loop
-/// can hold &mut references to this type throughout.
-struct HeatmapUploader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    cancel: CancellationToken,
+type Scheduler = TenantBackgroundJobs<
+    HeatmapUploader,
+    UploadPending,
+    WriteInProgress,
+    WriteComplete,
+    UploadCommand,
+>;
 
-    tenants: HashMap<TenantShardId, UploaderTenantState>,
-
-    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
-    /// limits permit it.
-    tenants_pending: std::collections::VecDeque<UploadPending>,
-
-    /// Tenants for which a task in `tasks` has been spawned.
-    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
-
-    tasks: JoinSet<()>,
-
-    /// Channel for our child tasks to send results to: we use a channel for results rather than
-    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
-    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
-    /// behavior.
-    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
-    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
-
-    concurrent_uploads: usize,
-
-    scheduling_interval: Duration,
-}
-
-/// The uploader task runs a loop that periodically wakes up and schedules tasks for
-/// tenants that require an upload, or handles any commands that have been sent into
-/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
-/// spawn.
-///
-/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
-/// all tenants that require an upload, and in between scheduling iterations we will
-/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
-///
-/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
-/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
-/// we might block waiting on a Tenant.
-pub(super) async fn heatmap_uploader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
-
-    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-    let mut uploader = HeatmapUploader {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-        tasks: JoinSet::new(),
-        tenants: HashMap::new(),
-        tenants_pending: std::collections::VecDeque::new(),
-        tenants_uploading: HashMap::new(),
-        task_result_tx: result_tx,
-        task_result_rx: result_rx,
-        concurrent_uploads,
-        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
-    };
-
-    tracing::info!("Waiting for background_jobs_can start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can is ready, proceeding.");
-
-    while !cancel.is_cancelled() {
-        // Look for new work: this is relatively expensive because we have to go acquire the lock on
-        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-        // require an upload.
-        uploader.schedule_iteration().await?;
-
-        // Between scheduling iterations, we will:
-        //  - Drain any complete tasks and spawn pending tasks
-        //  - Handle incoming administrative commands
-        //  - Check our cancellation token
-        let next_scheduling_iteration = Instant::now()
-            .checked_add(uploader.scheduling_interval)
-            .unwrap_or_else(|| {
-                tracing::warn!(
-                    "Scheduling interval invalid ({}s), running immediately!",
-                    uploader.scheduling_interval.as_secs_f64()
-                );
-                Instant::now()
-            });
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                    tracing::info!("Heatmap uploader joining tasks");
-                    while let Some(_r) = uploader.tasks.join_next().await {};
-                    tracing::info!("Heatmap uploader terminating");
-
-                    break;
-                },
-                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
-                    break;},
-                cmd = command_queue.recv() => {
-                    tracing::debug!("heatmap_uploader_task: woke for command queue");
-                    let cmd = match cmd {
-                        Some(c) =>c,
-                        None => {
-                            // SecondaryController was destroyed, and this has raced with
-                            // our CancellationToken
-                            tracing::info!("Heatmap uploader terminating");
-                            cancel.cancel();
-                            break;
-                        }
-                    };
-
-                    let CommandRequest{
-                        response_tx,
-                        payload
-                    } = cmd;
-                    uploader.handle_command(payload, response_tx);
-                },
-                _ = uploader.process_next_completion() => {
-                    if !cancel.is_cancelled() {
-                        uploader.spawn_pending();
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-impl HeatmapUploader {
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
+#[async_trait::async_trait]
+impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
+    for HeatmapUploader
+{
+    async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
         // Cull any entries in self.tenants whose Arc<Tenant> is gone
         self.tenants
             .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
 
-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.tenants_pending.clear();
-
-        // Used a fixed 'now' through the following loop, for efficiency and fairness.
         let now = Instant::now();
 
-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        const YIELD_ITERATIONS: usize = 1000;
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };
 
-        // Iterate over tenants looking for work to do.
         let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-        for (i, tenant) in tenants.into_iter().enumerate() {
-            // Process is shutting down, drop out
-            if self.cancel.is_cancelled() {
-                return Ok(());
-            }
 
-            // Skip tenants that already have a write in flight
-            if self
-                .tenants_uploading
-                .contains_key(tenant.get_tenant_shard_id())
-            {
-                continue;
-            }
+        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
+            let period = match tenant.get_heatmap_period() {
+                None => {
+                    // Heatmaps are disabled for this tenant
+                    return;
+                }
+                Some(period) => {
+                    // If any tenant has asked for uploads more frequent than our scheduling interval,
+                    // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                    // we may set rather short intervals.
+                    result.want_interval = match result.want_interval {
+                        None => Some(period),
+                        Some(existing) => Some(std::cmp::min(period, existing)),
+                    };
 
-            self.maybe_schedule_upload(&now, tenant);
+                    period
+                }
+            };
 
-            if i + 1 % YIELD_ITERATIONS == 0 {
-                tokio::task::yield_now().await;
-            }
-        }
-
-        // Spawn tasks for as many of our pending tenants as we can.
-        self.spawn_pending();
-
-        Ok(())
-    }
-
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) {
-        match self.task_result_rx.recv().await {
-            Some(r) => {
-                self.on_completion(r);
-            }
-            None => {
-                unreachable!("Result sender is stored on Self");
-            }
-        }
-    }
-
-    /// The 'maybe' refers to the tenant's state: whether it is configured
-    /// for heatmap uploads at all, and whether sufficient time has passed
-    /// since the last upload.
-    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
-        match tenant.get_heatmap_period() {
-            None => {
-                // Heatmaps are disabled for this tenant
+            // Stale attachments do not upload anything: if we are in this state, there is probably some
+            // other attachment in mode Single or Multi running on another pageserver, and we don't
+            // want to thrash and overwrite their heatmap uploads.
+            if tenant.get_attach_mode() == AttachmentMode::Stale {
                 return;
             }
-            Some(period) => {
-                // If any tenant has asked for uploads more frequent than our scheduling interval,
-                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                // we may set rather short intervals.
-                if period < self.scheduling_interval {
-                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
-                }
+
+            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+            // with the completion time in on_completion.
+            let state = self
+                .tenants
+                .entry(*tenant.get_tenant_shard_id())
+                .or_insert_with(|| {
+                    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
+
+                    UploaderTenantState {
+                        tenant: Arc::downgrade(&tenant),
+                        last_upload: None,
+                        next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
+                        last_digest: None,
+                    }
+                });
+
+            // Decline to do the upload if insufficient time has passed
+            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
+                return;
             }
-        }
 
-        // Stale attachments do not upload anything: if we are in this state, there is probably some
-        // other attachment in mode Single or Multi running on another pageserver, and we don't
-        // want to thrash and overwrite their heatmap uploads.
-        if tenant.get_attach_mode() == AttachmentMode::Stale {
-            return;
-        }
-
-        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-        // with the completion time in on_completion.
-        let state = self
-            .tenants
-            .entry(*tenant.get_tenant_shard_id())
-            .or_insert_with(|| UploaderTenantState {
-                tenant: Arc::downgrade(&tenant),
-                last_upload: None,
-                next_upload: Some(Instant::now()),
-                last_digest: None,
+            let last_digest = state.last_digest;
+            result.jobs.push(UploadPending {
+                tenant,
+                last_digest,
+                target_time: state.next_upload,
+                period: Some(period),
             });
+        })
+        .await
+        .ok();
 
-        // Decline to do the upload if insufficient time has passed
-        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
-            return;
-        }
+        result
+    }
 
-        let last_digest = state.last_digest;
-        self.tenants_pending.push_back(UploadPending {
+    fn spawn(
+        &mut self,
+        job: UploadPending,
+    ) -> (
+        WriteInProgress,
+        Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
+    ) {
+        let UploadPending {
             tenant,
             last_digest,
-        })
-    }
+            target_time,
+            period,
+        } = job;
 
-    fn spawn_pending(&mut self) {
-        while !self.tenants_pending.is_empty()
-            && self.tenants_uploading.len() < self.concurrent_uploads
-        {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.tenants_pending.pop_front().unwrap();
-            self.spawn_upload(pending.tenant, pending.last_digest);
-        }
-    }
-
-    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
         let remote_storage = self.remote_storage.clone();
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
         let (completion, barrier) = utils::completion::channel();
-        let result_tx = self.task_result_tx.clone();
-        self.tasks.spawn(async move {
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        (WriteInProgress { barrier }, Box::pin(async move {
             // Guard for the barrier in [`WriteInProgress`]
             let _completion = completion;
 
@@ -362,22 +264,47 @@ impl HeatmapUploader {
             };
 
             let now = Instant::now();
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = now.duration_since(target_time);
+
+                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
+            }
+
             let next_upload = tenant
                 .get_heatmap_period()
                 .and_then(|period| now.checked_add(period));
 
-            result_tx
-                .send(WriteComplete {
+            WriteComplete {
                     tenant_shard_id: *tenant.get_tenant_shard_id(),
                     completed_at: now,
                     digest,
                     next_upload,
-                })
-                .ok();
-        });
+                }
+        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
+    }
 
-        self.tenants_uploading
-            .insert(tenant_shard_id, WriteInProgress { barrier });
+    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        tracing::info!(
+            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+            "Starting heatmap write on command");
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        Ok(UploadPending {
+            // Ignore our state for last digest: this forces an upload even if nothing has changed
+            last_digest: None,
+            tenant,
+            target_time: None,
+            period: None,
+        })
     }
 
     #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -389,7 +316,6 @@ impl HeatmapUploader {
             digest,
             next_upload,
         } = completion;
-        self.tenants_uploading.remove(&tenant_shard_id);
         use std::collections::hash_map::Entry;
         match self.tenants.entry(tenant_shard_id) {
             Entry::Vacant(_) => {
@@ -402,69 +328,6 @@ impl HeatmapUploader {
             }
         }
     }
-
-    fn handle_command(
-        &mut self,
-        command: UploadCommand,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        match command {
-            UploadCommand::Upload(tenant_shard_id) => {
-                // If an upload was ongoing for this tenant, let it finish first.
-                let barrier = if let Some(writing_state) =
-                    self.tenants_uploading.get(&tenant_shard_id)
-                {
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap write to complete");
-                    writing_state.barrier.clone()
-                } else {
-                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
-                    // starting of other background work.
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Starting heatmap write on command");
-                    let tenant = match self
-                        .tenant_manager
-                        .get_attached_tenant_shard(tenant_shard_id, true)
-                    {
-                        Ok(t) => t,
-                        Err(e) => {
-                            // Drop result of send: we don't care if caller dropped their receiver
-                            drop(response_tx.send(CommandResponse {
-                                result: Err(e.into()),
-                            }));
-                            return;
-                        }
-                    };
-                    self.spawn_upload(tenant, None);
-                    let writing_state = self
-                        .tenants_uploading
-                        .get(&tenant_shard_id)
-                        .expect("We just inserted this");
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap upload to complete");
-
-                    writing_state.barrier.clone()
-                };
-
-                // This task does no I/O: it only listens for a barrier's completion and then
-                // sends to the command response channel.  It is therefore safe to spawn this without
-                // any gates/task_mgr hooks.
-                tokio::task::spawn(async move {
-                    barrier.wait().await;
-
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Heatmap upload complete");
-
-                    // Drop result of send: we don't care if caller dropped their receiver
-                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
-                });
-            }
-        }
-    }
 }
 
 enum UploadHeatmapOutcome {
@@ -487,7 +350,6 @@ enum UploadHeatmapError {
 
 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
     remote_storage: GenericRemoteStorage,
     tenant: &Arc<Tenant>,
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
new file mode 100644
index 0000000000..cf01a100d9
--- /dev/null
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -0,0 +1,361 @@
+use async_trait;
+use futures::Future;
+use std::{
+    collections::HashMap,
+    marker::PhantomData,
+    pin::Pin,
+    time::{Duration, Instant},
+};
+
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use utils::{completion::Barrier, yielding_loop::yielding_loop};
+
+use super::{CommandRequest, CommandResponse};
+
+/// Scheduling interval is the time between calls to JobGenerator::schedule.
+/// When we schedule jobs, the job generator may provide a hint of its preferred
+/// interval, which we will respect within these intervals.
+const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
+
+/// Scheduling helper for background work across many tenants.
+///
+/// Systems that need to run background work across many tenants may use this type
+/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
+/// implementation to provide the work to execute.  This is a simple scheduler that just
+/// polls the generator for outstanding work, replacing its queue of pending work with
+/// what the generator yields on each call: the job generator can change its mind about
+/// the order of jobs between calls.  The job generator is notified when jobs complete,
+/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
+/// admin APIs).
+///
+/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
+///
+/// G: A JobGenerator that this scheduler will poll to find pending jobs
+/// PJ: 'Pending Job': type for job descriptors that are ready to run
+/// RJ: 'Running Job' type' for jobs that have been spawned
+/// C : 'Completion' type that spawned jobs will send when they finish
+/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
+pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    G: JobGenerator<PJ, RJ, C, CMD>,
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    generator: G,
+
+    /// Ready to run.  Will progress to `running` once concurrent limit is satisfied, or
+    /// be removed on next scheduling pass.
+    pending: std::collections::VecDeque<PJ>,
+
+    /// Tasks currently running in Self::tasks for these tenants.  Check this map
+    /// before pushing more work into pending for the same tenant.
+    running: HashMap<TenantShardId, RJ>,
+
+    tasks: JoinSet<C>,
+
+    concurrency: usize,
+
+    /// How often we would like schedule_interval to be called.
+    pub(super) scheduling_interval: Duration,
+
+    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
+}
+
+#[async_trait::async_trait]
+pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    /// Called at each scheduling interval.  Return a list of jobs to run, most urgent first.
+    ///
+    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
+    /// Implementations should take care to yield the executor periodically if running
+    /// very long loops.
+    ///
+    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
+    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
+    /// and re-generated.
+    async fn schedule(&mut self) -> SchedulingResult<PJ>;
+
+    /// Called when a pending job is ready to be run.
+    ///
+    /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
+    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
+
+    /// Called when a job previously spawned with spawn() transmits its completion
+    fn on_completion(&mut self, completion: C);
+
+    /// Called when a command is received.  A job will be spawned immediately if the return
+    /// value is Some, ignoring concurrency limits and the pending queue.
+    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
+}
+
+/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
+pub(super) struct SchedulingResult<PJ> {
+    pub(super) jobs: Vec<PJ>,
+    /// The job generator would like to be called again this soon
+    pub(super) want_interval: Option<Duration>,
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait PendingJob {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait Completion: Send + 'static {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait RunningJob {
+    fn get_barrier(&self) -> Barrier;
+}
+
+impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+    G: JobGenerator<PJ, RJ, C, CMD>,
+{
+    pub(super) fn new(generator: G, concurrency: usize) -> Self {
+        Self {
+            generator,
+            pending: std::collections::VecDeque::new(),
+            running: HashMap::new(),
+            tasks: JoinSet::new(),
+            concurrency,
+            scheduling_interval: MAX_SCHEDULING_INTERVAL,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub(super) async fn run(
+        &mut self,
+        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
+        background_jobs_can_start: Barrier,
+        cancel: CancellationToken,
+    ) {
+        tracing::info!("Waiting for background_jobs_can start...");
+        background_jobs_can_start.wait().await;
+        tracing::info!("background_jobs_can is ready, proceeding.");
+
+        while !cancel.is_cancelled() {
+            // Look for new work: this is relatively expensive because we have to go acquire the lock on
+            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+            // require an upload.
+            self.schedule_iteration(&cancel).await;
+
+            if cancel.is_cancelled() {
+                return;
+            }
+
+            // Schedule some work, if concurrency limit permits it
+            self.spawn_pending();
+
+            // Between scheduling iterations, we will:
+            //  - Drain any complete tasks and spawn pending tasks
+            //  - Handle incoming administrative commands
+            //  - Check our cancellation token
+            let next_scheduling_iteration = Instant::now()
+                .checked_add(self.scheduling_interval)
+                .unwrap_or_else(|| {
+                    tracing::warn!(
+                        "Scheduling interval invalid ({}s)",
+                        self.scheduling_interval.as_secs_f64()
+                    );
+                    // unwrap(): this constant is small, cannot fail to add to time unless
+                    // we are close to the end of the universe.
+                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
+                });
+            loop {
+                tokio::select! {
+                    _ = cancel.cancelled() => {
+                        tracing::info!("joining tasks");
+                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                        // It is the callers responsibility to make sure that the tasks they scheduled
+                        // respect an appropriate cancellation token, to shut down promptly.  It is only
+                        // safe to wait on joining these tasks because we can see the cancellation token
+                        // has been set.
+                        while let Some(_r) = self.tasks.join_next().await {}
+                        tracing::info!("terminating on cancellation token.");
+
+                        break;
+                    },
+                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                        tracing::debug!("woke for scheduling interval");
+                        break;},
+                    cmd = command_queue.recv() => {
+                        tracing::debug!("woke for command queue");
+                        let cmd = match cmd {
+                            Some(c) =>c,
+                            None => {
+                                // SecondaryController was destroyed, and this has raced with
+                                // our CancellationToken
+                                tracing::info!("terminating on command queue destruction");
+                                cancel.cancel();
+                                break;
+                            }
+                        };
+
+                        let CommandRequest{
+                            response_tx,
+                            payload
+                        } = cmd;
+                        self.handle_command(payload, response_tx);
+                    },
+                    _ = async {
+                        let completion = self.process_next_completion().await;
+                        match completion {
+                            Some(c) => {
+                                self.generator.on_completion(c);
+                                if !cancel.is_cancelled() {
+                                    self.spawn_pending();
+                                }
+                            },
+                            None => {
+                                // Nothing is running, so just wait: expect that this future
+                                // will be dropped when something in the outer select! fires.
+                                cancel.cancelled().await;
+                            }
+                        }
+
+                     } => {}
+                }
+            }
+        }
+    }
+
+    fn do_spawn(&mut self, job: PJ) {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        let (in_progress, fut) = self.generator.spawn(job);
+
+        self.tasks.spawn(fut);
+
+        self.running.insert(tenant_shard_id, in_progress);
+    }
+
+    /// For all pending tenants that are elegible for execution, spawn their task.
+    ///
+    /// Caller provides the spawn operation, we track the resulting execution.
+    fn spawn_pending(&mut self) {
+        while !self.pending.is_empty() && self.running.len() < self.concurrency {
+            // unwrap: loop condition includes !is_empty()
+            let pending = self.pending.pop_front().unwrap();
+            self.do_spawn(pending);
+        }
+    }
+
+    /// For administrative commands: skip the pending queue, ignore concurrency limits
+    fn spawn_now(&mut self, job: PJ) -> &RJ {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        self.do_spawn(job);
+        self.running
+            .get(&tenant_shard_id)
+            .expect("We just inserted this")
+    }
+
+    /// Wait until the next task completes, and handle its completion
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) -> Option<C> {
+        match self.tasks.join_next().await {
+            Some(r) => {
+                // We use a channel to drive completions, but also
+                // need to drain the JoinSet to avoid completed tasks
+                // accumulating.  These calls are 1:1 because every task
+                // we spawn into this joinset submits is result to the channel.
+                let completion = r.expect("Panic in background task");
+
+                self.running.remove(completion.get_tenant_shard_id());
+                Some(completion)
+            }
+            None => {
+                // Nothing is running, so we have nothing to wait for.  We may drop out: the
+                // main even loop will call us again after the next time it has run something.
+                None
+            }
+        }
+    }
+
+    /// Convert the command into a pending job, spawn it, and when the spawned
+    /// job completes, send the result down `response_tx`.
+    fn handle_command(
+        &mut self,
+        cmd: CMD,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        let job = match self.generator.on_command(cmd) {
+            Ok(j) => j,
+            Err(e) => {
+                response_tx.send(CommandResponse { result: Err(e) }).ok();
+                return;
+            }
+        };
+
+        let tenant_shard_id = job.get_tenant_shard_id();
+        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            barrier
+        } else {
+            let running = self.spawn_now(job);
+            running.get_barrier().clone()
+        };
+
+        // This task does no I/O: it only listens for a barrier's completion and then
+        // sends to the command response channel.  It is therefore safe to spawn this without
+        // any gates/task_mgr hooks.
+        tokio::task::spawn(async move {
+            barrier.wait().await;
+
+            response_tx.send(CommandResponse { result: Ok(()) }).ok();
+        });
+    }
+
+    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
+        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
+    }
+
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    ///
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    ///
+    /// This function resets the pending list: it is assumed that the caller may change their mind about
+    /// which tenants need work between calls to schedule_iteration.
+    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
+        let SchedulingResult {
+            jobs,
+            want_interval,
+        } = self.generator.schedule().await;
+
+        // Adjust interval based on feedback from the job generator
+        if let Some(want_interval) = want_interval {
+            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
+            self.scheduling_interval = Duration::from_secs(std::cmp::min(
+                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
+                MAX_SCHEDULING_INTERVAL.as_secs(),
+            ));
+        }
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.pending.clear();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
+            // Skip tenants that already have a write in flight
+            if !self.running.contains_key(job.get_tenant_shard_id()) {
+                self.pending.push_back(job);
+            }
+        })
+        .await
+        .ok();
+    }
+}
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 7ff1873eda..aa5894cc37 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind {
     ConsumptionMetricsCollectMetrics,
     ConsumptionMetricsSyntheticSizeWorker,
     InitialLogicalSizeCalculation,
+    HeatmapUpload,
+    SecondaryDownload,
 }
 
 impl BackgroundLoopKind {
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 6dea0d923d..ccacc0a987 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
         self.verbose_error(res)
 
+    def tenant_secondary_download(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
+        self.verbose_error(res)
+
     def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
         assert "tenant_id" not in config.keys()
         res = self.put(
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8ae4297983..a9eff99a0c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -1,9 +1,11 @@
 import random
+from pathlib import Path
 from typing import Any, Dict, Optional
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
+from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
+from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
@@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
         flush_ms=5000,
     )
 
+    # Encourage the new location to download while still in secondary mode
+    pageserver_b.http_client().tenant_secondary_download(tenant_id)
+
     migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
     log.info(f"Acquired generation {migrated_generation} for destination pageserver")
     assert migrated_generation == initial_generation + 1
@@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     # Writes and reads still work in AttachedStale.
     workload.validate(pageserver_a.id)
 
-    # TODO: call into secondary mode API hooks to do an upload/download sync
-
     # Generate some more dirty writes: we expect the origin to ingest WAL in
     # in AttachedStale
     workload.churn_rows(64, pageserver_a.id, upload=False)
@@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
     log.info(f"Read back heatmap: {heatmap_second}")
     assert heatmap_second != heatmap_first
     validate_heatmap(heatmap_second)
+
+
+def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    """
+    Inspect local storage on a pageserver to discover which layer files are present.
+
+    :return: list of relative paths to layers, from the timeline root.
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+
+    def relative(p: Path) -> Path:
+        return p.relative_to(timeline_path)
+
+    return sorted(
+        list(
+            map(
+                relative,
+                filter(
+                    lambda path: path.name != "metadata"
+                    and "ephemeral" not in path.name
+                    and "temp" not in path.name,
+                    timeline_path.glob("*"),
+                ),
+            )
+        )
+    )
+
+
+def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the overall data flow in secondary mode:
+     - Heatmap uploads from the attached location
+     - Heatmap & layer downloads from the secondary location
+     - Eviction of layers on the attached location results in deletion
+       on the secondary location as well.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps_attached = env.pageservers[0]
+    ps_secondary = env.pageservers[1]
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, ps_attached.id)
+
+    # Configure a secondary location
+    log.info("Setting up secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+    readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
+    log.info(f"Read back conf: {readback_conf}")
+
+    # Explicit upload/download cycle
+    # ==============================
+    log.info("Synchronizing after initial write...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Make changes on attached pageserver, check secondary downloads them
+    # ===================================================================
+    log.info("Synchronizing after subsequent write...")
+    workload.churn_rows(128, ps_attached.id)
+
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
+    # walreceiver is still doing something.
+    import time
+
+    time.sleep(5)
+
+    # Do evictions on attached pageserver, check secondary follows along
+    # ==================================================================
+    log.info("Evicting a layer...")
+    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
+    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
+
+    log.info("Synchronizing after eviction...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Scrub the remote storage
+    # ========================
+    # This confirms that the scrubber isn't upset by the presence of the heatmap
+    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+
+    # Detach secondary and delete tenant
+    # ===================================
+    # This confirms that the heatmap gets cleaned up as well as other normal content.
+    log.info("Detaching secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+        },
+    )
+
+    log.info("Deleting tenant...")
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
+
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )

From 7de829e475471cac50cfbcb80e2792a7658fac9f Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 5 Jan 2024 15:35:07 +0000
Subject: [PATCH 15/21] test_runner: replace black with ruff format (#6268)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

`black` is slow sometimes, we can replace it with `ruff format` (a new
feature in 0.1.2 [0]), which produces pretty similar to black style [1].

On my local machine (MacBook M1 Pro 16GB):
```
# `black` on main
$ hyperfine "BLACK_CACHE_DIR=/dev/null poetry run black ."
Benchmark 1: BLACK_CACHE_DIR=/dev/null poetry run black .
  Time (mean ± σ):      3.131 s ±  0.090 s    [User: 5.194 s, System: 0.859 s]
  Range (min … max):    3.047 s …  3.354 s    10 runs
```
```
# `ruff format` on the current PR
$ hyperfine "RUFF_NO_CACHE=true poetry run ruff format"
Benchmark 1: RUFF_NO_CACHE=true poetry run ruff format
  Time (mean ± σ):     300.7 ms ±  50.2 ms    [User: 259.5 ms, System: 76.1 ms]
  Range (min … max):   267.5 ms … 420.2 ms    10 runs
```

## Summary of changes
- Replace `black` with `ruff format` everywhere

- [0] https://docs.astral.sh/ruff/formatter/
- [1] https://docs.astral.sh/ruff/formatter/#black-compatibility
---
 .github/workflows/build_and_test.yml          |   8 +-
 docs/sourcetree.md                            |   8 +-
 poetry.lock                                   | 127 +++---------------
 pre-commit.py                                 |  24 ++--
 pyproject.toml                                |  16 +--
 scripts/export_import_between_pageservers.py  |   2 +-
 scripts/reformat                              |   4 +-
 test_runner/fixtures/neon_fixtures.py         |   4 +-
 test_runner/fixtures/pageserver/http.py       |   4 +-
 test_runner/performance/test_perf_olap.py     |   7 +-
 .../performance/test_wal_backpressure.py      |   3 +-
 .../regress/test_attach_tenant_config.py      |  13 +-
 test_runner/regress/test_compatibility.py     |   8 +-
 test_runner/regress/test_crafted_wal_end.py   |   1 -
 test_runner/regress/test_layer_eviction.py    |   4 +-
 test_runner/regress/test_pageserver_api.py    |   3 +-
 test_runner/regress/test_tenant_detach.py     |  14 +-
 test_runner/regress/test_tenant_relocation.py |   6 +-
 .../test_tenants_with_remote_storage.py       |   8 +-
 19 files changed, 79 insertions(+), 185 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 78deff6e85..880d6044f2 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -105,11 +105,11 @@ jobs:
       - name: Install Python deps
         run: ./scripts/pysync
 
-      - name: Run ruff to ensure code format
-        run: poetry run ruff .
+      - name: Run `ruff check` to ensure code format
+        run: poetry run ruff check .
 
-      - name: Run black to ensure code format
-        run: poetry run black --diff --check .
+      - name: Run `ruff format` to ensure code format
+        run: poetry run ruff format --check .
 
       - name: Run mypy to check types
         run: poetry run mypy .
diff --git a/docs/sourcetree.md b/docs/sourcetree.md
index 95bed83ae5..12fa80349e 100644
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
 
 ### Obligatory checks
-We force code formatting via `black`, `ruff`, and type hints via `mypy`.
+We force code formatting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):
 
 ```bash
-poetry run black .  # All code is reformatted
-poetry run ruff .  # Python linter
-poetry run mypy .  # Ensure there are no typing errors
+poetry run ruff format . # All code is reformatted
+poetry run ruff check .  # Python linter
+poetry run mypy .        # Ensure there are no typing errors
 ```
 
 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
diff --git a/poetry.lock b/poetry.lock
index c597d811bd..428698cb5a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -288,55 +288,6 @@ files = [
     {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
 ]
 
-[[package]]
-name = "black"
-version = "23.3.0"
-description = "The uncompromising code formatter."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
-    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
-    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
-    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
-    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
-    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
-    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
-    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
-    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
-    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
-    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
-    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
-    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
-    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
-]
-
-[package.dependencies]
-click = ">=8.0.0"
-mypy-extensions = ">=0.4.3"
-packaging = ">=22.0"
-pathspec = ">=0.9.0"
-platformdirs = ">=2"
-tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
-
-[package.extras]
-colorama = ["colorama (>=0.4.3)"]
-d = ["aiohttp (>=3.7.4)"]
-jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
-uvloop = ["uvloop (>=0.15.2)"]
-
 [[package]]
 name = "boto3"
 version = "1.34.11"
@@ -1627,17 +1578,6 @@ files = [
     {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]
 
-[[package]]
-name = "pathspec"
-version = "0.9.0"
-description = "Utility library for gitignore style pattern matching of file paths."
-optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
-files = [
-    {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"},
-    {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"},
-]
-
 [[package]]
 name = "pbr"
 version = "5.9.0"
@@ -1649,21 +1589,6 @@ files = [
     {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"},
 ]
 
-[[package]]
-name = "platformdirs"
-version = "2.5.2"
-description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
-    {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"},
-]
-
-[package.extras]
-docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
-test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
-
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -2207,28 +2132,28 @@ pyasn1 = ">=0.1.3"
 
 [[package]]
 name = "ruff"
-version = "0.0.269"
-description = "An extremely fast Python linter, written in Rust."
+version = "0.1.11"
+description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
-    {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
-    {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
-    {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
-    {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
-    {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
+    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
+    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
+    {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
+    {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
+    {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
+    {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
 ]
 
 [[package]]
@@ -2496,16 +2421,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2743,4 +2658,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"
+content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482"
diff --git a/pre-commit.py b/pre-commit.py
index dc0b9ed588..c5ed63ac44 100755
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
     return cmd
 
 
-def black(fix_inplace: bool) -> str:
-    cmd = "poetry run black"
-    if not fix_inplace:
-        cmd += " --diff --check"
+def ruff_check(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff check"
+    if fix_inplace:
+        cmd += " --fix"
     return cmd
 
 
-def ruff(fix_inplace: bool) -> str:
-    cmd = "poetry run ruff"
-    if fix_inplace:
-        cmd += " --fix"
+def ruff_format(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff format"
+    if not fix_inplace:
+        cmd += " --diff --check"
     return cmd
 
 
@@ -109,16 +109,16 @@ if __name__ == "__main__":
         no_color=args.no_color,
     )
     check(
-        name="black",
+        name="ruff check",
         suffix=".py",
-        cmd=black(fix_inplace=args.fix_inplace),
+        cmd=ruff_check(fix_inplace=args.fix_inplace),
         changed_files=files,
         no_color=args.no_color,
     )
     check(
-        name="ruff",
+        name="ruff format",
         suffix=".py",
-        cmd=ruff(fix_inplace=args.fix_inplace),
+        cmd=ruff_format(fix_inplace=args.fix_inplace),
         changed_files=files,
         no_color=args.no_color,
     )
diff --git a/pyproject.toml b/pyproject.toml
index 18c8ece4a7..bb04123e05 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,22 +40,13 @@ pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
 
 [tool.poetry.group.dev.dependencies]
-black = "^23.3.0"
 mypy = "==1.3.0"
-ruff = "^0.0.269"
+ruff = "^0.1.11"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
-[tool.black]
-line-length = 100
-extend-exclude = '''
-/(
-    vendor
-)/
-'''
-
 [tool.mypy]
 exclude = "^vendor/"
 check_untyped_defs = true
@@ -82,7 +73,9 @@ ignore_missing_imports = true
 [tool.ruff]
 target-version = "py39"
 extend-exclude = ["vendor/"]
-ignore = ["E501"]
+ignore = [
+    "E501", # Line too long, we don't want to be too strict about it
+]
 select = [
     "E", # pycodestyle
     "F", # Pyflakes
@@ -90,3 +83,4 @@ select = [
     "W", # pycodestyle
     "B", # bugbear
 ]
+line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py
index ff584bd4b0..980f343047 100755
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
     If those files already exist, we will overwrite them.
     Returns basepath for files with captured output.
     """
-    assert type(cmd) is list
+    assert isinstance(cmd, list)
     base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
     basepath = os.path.join(capture_dir, base)
     stdout_filename = basepath + ".stdout"
diff --git a/scripts/reformat b/scripts/reformat
index 8688044f66..3533c4dcb8 100755
--- a/scripts/reformat
+++ b/scripts/reformat
@@ -6,5 +6,5 @@ set -euox pipefail
 echo 'Reformatting Rust code'
 cargo fmt
 echo 'Reformatting Python code'
-poetry run ruff --fix test_runner scripts
-poetry run black test_runner scripts
+poetry run ruff check --fix test_runner scripts
+poetry run ruff format test_runner scripts
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f33e17a76a..001d4e23a9 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1101,8 +1101,8 @@ class AbstractNeonCli(abc.ABC):
         If `local_binpath` is true, then we are invoking a test utility
         """
 
-        assert type(arguments) == list
-        assert type(self.COMMAND) == str
+        assert isinstance(arguments, list)
+        assert isinstance(self.COMMAND, str)
 
         if local_binpath:
             # Test utility
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index ccacc0a987..a779dcc436 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -365,9 +365,9 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(res, dict)
         assert TenantId(res["id"]) == tenant_id
         size = res["size"]
-        assert type(size) == int
+        assert isinstance(size, int)
         inputs = res["inputs"]
-        assert type(inputs) is dict
+        assert isinstance(inputs, dict)
         return (size, inputs)
 
     def tenant_size_debug(self, tenant_id: TenantId) -> str:
diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py
index 1e6e9a0174..8a9509ea44 100644
--- a/test_runner/performance/test_perf_olap.py
+++ b/test_runner/performance/test_perf_olap.py
@@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare):
 # Please do not alter the label for the query, as it is used to identify it.
 # Labels for ClickBench queries match the labels in ClickBench reports
 # on https://benchmark.clickhouse.com/ (the DB size may differ).
+#
+# Disable auto formatting for the list of queries so that it's easier to read
+# fmt: off
 QUERIES: Tuple[LabelledQuery, ...] = (
-    # Disable `black` formatting for the list of queries so that it's easier to read
-    # fmt: off
     ### ClickBench queries:
     LabelledQuery("Q0",  r"SELECT COUNT(*) FROM hits;"),
     LabelledQuery("Q1",  r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
@@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = (
     # LabelledQuery("NQ0", r"..."),
     # LabelledQuery("NQ1", r"..."),
     # ...
-    # fmt: on
 )
+# fmt: on
 
 EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
 
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index 3cb4b667ff..7eb244d378 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare:
     else:
         assert (
             len(x) == 2
-        ), f"request param ({request.param}) should have a format of \
-        `neon_{{safekeepers_enable_fsync}}`"
+        ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`"
 
         # `NeonCompare` interface
         neon_env_builder = request.getfixturevalue("neon_env_builder")
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 32397bbcc1..ed389b1aa2 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv):
     assert set(our_tenant_config.effective_config.keys()) == set(
         fully_custom_config.keys()
     ), "ensure we cover all config options"
-    assert {
-        k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
-        for k in fully_custom_config.keys()
-    } == {
-        k: True for k in fully_custom_config.keys()
-    }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
+    assert (
+        {
+            k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
+            for k in fully_custom_config.keys()
+        }
+        == {k: True for k in fully_custom_config.keys()}
+    ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
 
     ps_http.tenant_detach(tenant_id)
     env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 5a9c2782e6..f9d6d0a934 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -186,9 +186,7 @@ def test_backward_compatibility(
         else:
             raise
 
-    assert (
-        not breaking_changes_allowed
-    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+    assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
 @check_ondisk_data_compatibility_if_enabled
@@ -247,9 +245,7 @@ def test_forward_compatibility(
         else:
             raise
 
-    assert (
-        not breaking_changes_allowed
-    ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+    assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
 def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py
index 7ec901af34..01ecc2b95f 100644
--- a/test_runner/regress/test_crafted_wal_end.py
+++ b/test_runner/regress/test_crafted_wal_end.py
@@ -2,7 +2,6 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
 
-
 # Restart nodes with WAL end having specially crafted shape, like last record
 # crossing segment boundary, to test decoding issues.
 
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 2cd2406065..efba2033fb 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -102,9 +102,7 @@ def test_basic_eviction(
         ), f"Did not expect to find {local_layer} layer after evicting"
 
     empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    assert (
-        not empty_layers
-    ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
+    assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
 
     evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
     assert (
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 573d2139ce..e29db1e252 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -145,8 +145,7 @@ def expect_updated_msg_lsn(
     last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"])
     assert (
         prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn
-    ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \
-        compared to the previous message's LSN {prev_msg_lsn}"
+    ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
 
     return last_msg_lsn
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 0dcbb23ad4..c6dbc77885 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
     tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
     assert (
         tenant_id not in tenants_after_detach
-    ), f"Ignored and then detached tenant {tenant_id} \
-        should not be present in pageserver's memory"
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
 
 
 # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
@@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
     tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
     assert (
         tenant_id not in tenants_after_detach
-    ), f"Ignored and then detached tenant {tenant_id} \
-        should not be present in pageserver's memory"
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
 
 
 def test_detach_while_attaching(
@@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         if found_broken:
             break
         time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
+    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
 
     env.pageserver.tenant_load(env.initial_tenant)
 
@@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
             break
         time.sleep(0.5)
 
-    assert (
-        found_active
-    ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
+    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index dcd7232b1b..1887bca23b 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver(
     files_before_detach = os.listdir(timeline_to_detach_local_path)
     assert (
         "metadata" in files_before_detach
-    ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\
-            but got: {files_before_detach}"
+    ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}"
     assert (
         len(files_before_detach) >= 2
-    ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\
-            but got {files_before_detach}"
+    ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}"
 
     return timeline_to_detach_local_path
 
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 07fb6dc5ca..6f05d7f7cb 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
         len(restored_timelines) == 1
     ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
     restored_timeline = restored_timelines[0]
-    assert restored_timeline["timeline_id"] == str(
-        timeline_id
+    assert (
+        restored_timeline["timeline_id"] == str(timeline_id)
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
     # Check that we had to retry the downloads
@@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
         len(restored_timelines) == 1
     ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage"
     retored_timeline = restored_timelines[0]
-    assert retored_timeline["timeline_id"] == str(
-        timeline_id
+    assert (
+        retored_timeline["timeline_id"] == str(timeline_id)
     ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage"
 
     # Request non-incremental logical size. Calculating it needs the layer file that

From a41c4122e39ab151f709262329576f9a9b5a2db7 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Thu, 4 Jan 2024 01:21:33 +0300
Subject: [PATCH 16/21] Don't suspend compute if there is active LR subscriber.

https://github.com/neondatabase/neon/issues/6258
---
 compute_tools/src/monitor.rs | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index f974d6023d..fd19b7e53f 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
 
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use crate::compute::ComputeNode;
 
@@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     }
                 }
 
+                // If there are existing (logical) walsenders, do not suspend.
+                //
+                // walproposer doesn't currently show up in pg_stat_replication,
+                // but protect if it will be
+                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+                match cli.query_one(ws_count_query, &[]) {
+                    Ok(r) => match r.try_get::<&str, i64>("count") {
+                        Ok(num_ws) => {
+                            if num_ws > 0 {
+                                last_active = Some(Utc::now());
+                            }
+                        }
+                        Err(e) => {
+                            warn!("failed to parse ws count: {:?}", e);
+                            continue;
+                        }
+                    },
+                    Err(e) => {
+                        warn!("failed to get list of walsenders: {:?}", e);
+                        continue;
+                    }
+                }
+
                 // Update the last activity in the shared state if we got a more recent one.
                 let mut state = compute.state.lock().unwrap();
                 // NB: `Some(<DateTime>)` is always greater than `None`.

From d1c0232e2190d1d0ddc9320223012196a8055ade Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 8 Jan 2024 03:53:15 +0000
Subject: [PATCH 17/21] pageserver: use `pub(crate)` in metrics.rs, and clean
 up unused items (#6275)

## Problem

Noticed while making other changes that there were `pub` items that were
unused.

## Summary of changes

- Make everything `pub(crate)` in metrics.rs, apart from items used from
`bin/`
- Fix the timelines eviction metric: it was never being incremented
- Remove an unused ephemeral_bytes counter.
---
 pageserver/src/metrics.rs                    | 84 +++++++++-----------
 pageserver/src/tenant/storage_layer/layer.rs |  1 +
 2 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c86adcfa3d..6f4431c3cf 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub enum StorageTimeOperation {
+pub(crate) enum StorageTimeOperation {
     #[strum(serialize = "layer flush")]
     LayerFlush,
 
@@ -55,7 +55,7 @@ pub enum StorageTimeOperation {
     CreateTenant,
 }
 
-pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     register_counter_vec!(
         "pageserver_storage_operations_seconds_sum",
         "Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_storage_operations_seconds_count",
         "Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub struct PageCacheMetricsForTaskKind {
+pub(crate) struct PageCacheMetricsForTaskKind {
     pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,
 
@@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind {
     pub read_hits_materialized_page_older_lsn: IntCounter,
 }
 
-pub struct PageCacheMetrics {
+pub(crate) struct PageCacheMetrics {
     map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }
 
@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
     map: EnumMap::from_array(std::array::from_fn(|task_kind| {
         let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
         let task_kind: &'static str = task_kind.into();
@@ -243,10 +243,9 @@ impl PageCacheMetrics {
     }
 }
 
-pub struct PageCacheSizeMetrics {
+pub(crate) struct PageCacheSizeMetrics {
     pub max_bytes: UIntGauge,
 
-    pub current_bytes_ephemeral: UIntGauge,
     pub current_bytes_immutable: UIntGauge,
     pub current_bytes_materialized_page: UIntGauge,
 }
@@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
+pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
+    Lazy::new(|| PageCacheSizeMetrics {
+        max_bytes: {
+            register_uint_gauge!(
+                "pageserver_page_cache_size_max_bytes",
+                "Maximum size of the page cache in bytes"
+            )
+            .expect("failed to define a metric")
+        },
+        current_bytes_immutable: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["immutable"])
+                .unwrap()
+        },
+        current_bytes_materialized_page: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["materialized_page"])
+                .unwrap()
+        },
+    });
 
 pub(crate) mod page_cache_eviction_metrics {
     use std::num::NonZeroUsize;
@@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
 
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub struct EvictionsWithLowResidenceDuration {
+pub(crate) struct EvictionsWithLowResidenceDuration {
     data_source: &'static str,
     threshold: Duration,
     counter: Option<IntCounter>,
 }
 
-pub struct EvictionsWithLowResidenceDurationBuilder {
+pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
     data_source: &'static str,
     threshold: Duration,
 }
@@ -1009,7 +1003,7 @@ pub enum SmgrQueryType {
 }
 
 #[derive(Debug)]
-pub struct SmgrQueryTimePerTimeline {
+pub(crate) struct SmgrQueryTimePerTimeline {
     metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }
 
@@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
     .map(|ms| (ms as f64) / 1000.0)
 });
 
-pub struct BasebackupQueryTime(HistogramVec);
-pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub(crate) struct BasebackupQueryTime(HistogramVec);
+pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
     BasebackupQueryTime({
         register_histogram_vec!(
             "pageserver_basebackup_query_seconds",
@@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime {
     }
 }
 
-pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_live_connections",
         "Number of live network connections",
@@ -1667,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);
 
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub struct StorageTimeMetricsTimer {
+pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
     start: Instant,
 }
@@ -1692,7 +1686,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub struct StorageTimeMetrics {
+pub(crate) struct StorageTimeMetrics {
     /// Sum of f64 seconds, per operation, tenant_id and timeline_id
     timeline_sum: Counter,
     /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1731,7 +1725,7 @@ impl StorageTimeMetrics {
 }
 
 #[derive(Debug)]
-pub struct TimelineMetrics {
+pub(crate) struct TimelineMetrics {
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
@@ -1939,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
     }
 }
 
-pub struct RemoteTimelineClientMetrics {
+pub(crate) struct RemoteTimelineClientMetrics {
     tenant_id: String,
     timeline_id: String,
     remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
@@ -2237,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics {
 
 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub trait MeasureRemoteOp: Sized {
+pub(crate) trait MeasureRemoteOp: Sized {
     fn measure_remote_op(
         self,
         tenant_id: TenantId,
@@ -2262,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}
 
 pin_project! {
-    pub struct MeasuredRemoteOp<F>
+    pub(crate) struct MeasuredRemoteOp<F>
     {
         #[pin]
         inner: F,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 8ae911b31e..f5adf9d639 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1118,6 +1118,7 @@ impl LayerInner {
                         tracing::info!("evicted layer after unknown residence period");
                     }
                 }
+                timeline.metrics.evictions.inc();
                 timeline
                     .metrics
                     .resident_physical_size_sub(self.desc.file_size);

From b5ed6f22aedf5d663baf72dd0da85226ea6ccd1a Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 8 Jan 2024 09:19:00 +0000
Subject: [PATCH 18/21] pageserver: clean up a TODO comment (#6282)

These functions don't need updating for sharding: it's fine for them to
remain shard-naive, as they're only used in the context of dumping a
layer file. The sharding metadata doesn't live in the layer file, it
lives in the index.
---
 pageserver/src/tenant/storage_layer/delta_layer.rs | 4 ++--
 pageserver/src/tenant/storage_layer/image_layer.rs | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index d339204127..4ded6d6a8d 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -320,8 +320,8 @@ impl DeltaLayer {
             .metadata()
             .context("get file metadata to determine size")?;
 
-        // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
-        // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+        // This function is never used for constructing layers in a running pageserver,
+        // so it does not need an accurate TenantShardId.
         let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
 
         Ok(DeltaLayer {
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 023122c0b1..f03c7642eb 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -278,8 +278,8 @@ impl ImageLayer {
             .metadata()
             .context("get file metadata to determine size")?;
 
-        // TODO(sharding): we should get TenantShardId from path.
-        // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
+        // This function is never used for constructing layers in a running pageserver,
+        // so it does not need an accurate TenantShardId.
         let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
 
         Ok(ImageLayer {

From b3a681d121625103fe8a2fddec44bfc7a77bb7bf Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 8 Jan 2024 09:19:10 +0000
Subject: [PATCH 19/21] s3_scrubber: updates for sharding (#6281)

This is a lightweight change to keep the scrubber providing sensible
output when using sharding.

- The timeline count was wrong when using sharding
- When checking for tenant existence, we didn't re-use results between
different shards in the same tenant

Closes: https://github.com/neondatabase/neon/issues/5929
---
 s3_scrubber/src/garbage.rs                    | 56 +++++++++++++++----
 s3_scrubber/src/scan_metadata.rs              | 38 +++++++++----
 .../regress/test_pageserver_generations.py    |  4 +-
 3 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/s3_scrubber/src/garbage.rs b/s3_scrubber/src/garbage.rs
index 7192afb91b..93bb115883 100644
--- a/s3_scrubber/src/garbage.rs
+++ b/s3_scrubber/src/garbage.rs
@@ -2,7 +2,10 @@
 //! S3 objects which are either not referenced by any metadata, or are referenced by a
 //! control plane tenant/timeline in a deleted state.
 
-use std::{collections::HashMap, sync::Arc};
+use std::{
+    collections::{HashMap, HashSet},
+    sync::Arc,
+};
 
 use anyhow::Context;
 use aws_sdk_s3::{
@@ -118,6 +121,13 @@ const S3_CONCURRENCY: usize = 32;
 // How many concurrent API requests to make to the console API.
 const CONSOLE_CONCURRENCY: usize = 128;
 
+struct ConsoleCache {
+    /// Set of tenants found in the control plane API
+    projects: HashMap<TenantId, ProjectData>,
+    /// Set of tenants for which the control plane API returned 404
+    not_found: HashSet<TenantId>,
+}
+
 async fn find_garbage_inner(
     bucket_config: BucketConfig,
     console_config: ConsoleConfig,
@@ -143,23 +153,49 @@ async fn find_garbage_inner(
         console_projects.len()
     );
 
-    // TODO(sharding): batch calls into Console so that we only call once for each TenantId,
-    // rather than checking the same TenantId for multiple TenantShardId
+    // Because many tenant shards may look up the same TenantId, we maintain a cache.
+    let console_cache = Arc::new(std::sync::Mutex::new(ConsoleCache {
+        projects: console_projects,
+        not_found: HashSet::new(),
+    }));
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
     let tenants = stream_tenants(&s3_client, &target);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
-        let console_projects = &console_projects;
+        let console_cache = console_cache.clone();
         async move {
-            match console_projects.get(&t.tenant_id) {
+            // Check cache before issuing API call
+            let project_data = {
+                let cache = console_cache.lock().unwrap();
+                let result = cache.projects.get(&t.tenant_id).cloned();
+                if result.is_none() && cache.not_found.contains(&t.tenant_id) {
+                    return Ok((t, None));
+                }
+                result
+            };
+
+            match project_data {
                 Some(project_data) => Ok((t, Some(project_data.clone()))),
-                None => api_client
-                    .find_tenant_project(t.tenant_id)
-                    .await
-                    .map_err(|e| anyhow::anyhow!(e))
-                    .map(|r| (t, r)),
+                None => {
+                    let project_data = api_client
+                        .find_tenant_project(t.tenant_id)
+                        .await
+                        .map_err(|e| anyhow::anyhow!(e));
+
+                    // Populate cache with result of API call
+                    {
+                        let mut cache = console_cache.lock().unwrap();
+                        if let Ok(Some(project_data)) = &project_data {
+                            cache.projects.insert(t.tenant_id, project_data.clone());
+                        } else if let Ok(None) = &project_data {
+                            cache.not_found.insert(t.tenant_id);
+                        }
+                    }
+
+                    project_data.map(|r| (t, r))
+                }
             }
         }
     });
diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs
index bfde8f0213..4b63bb3884 100644
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -17,7 +17,9 @@ use utils::id::TenantId;
 
 #[derive(Serialize)]
 pub struct MetadataSummary {
-    count: usize,
+    tenant_count: usize,
+    timeline_count: usize,
+    timeline_shard_count: usize,
     with_errors: HashSet<TenantShardTimelineId>,
     with_warnings: HashSet<TenantShardTimelineId>,
     with_orphans: HashSet<TenantShardTimelineId>,
@@ -87,7 +89,9 @@ impl MinMaxHisto {
 impl MetadataSummary {
     fn new() -> Self {
         Self {
-            count: 0,
+            tenant_count: 0,
+            timeline_count: 0,
+            timeline_shard_count: 0,
             with_errors: HashSet::new(),
             with_warnings: HashSet::new(),
             with_orphans: HashSet::new(),
@@ -112,7 +116,7 @@ impl MetadataSummary {
     }
 
     fn update_data(&mut self, data: &S3TimelineBlobData) {
-        self.count += 1;
+        self.timeline_shard_count += 1;
         if let BlobDataParseResult::Parsed {
             index_part,
             index_part_generation: _,
@@ -158,16 +162,20 @@ impl MetadataSummary {
         );
 
         format!(
-            "Timelines: {0}
-With errors: {1}
-With warnings: {2}
-With orphan layers: {3}
+            "Tenants: {}
+Timelines: {}
+Timeline-shards: {}
+With errors: {}
+With warnings: {}
+With orphan layers: {}
 Index versions: {version_summary}
-Timeline size bytes: {4}
-Layer size bytes: {5}
-Timeline layer count: {6}
+Timeline size bytes: {}
+Layer size bytes: {}
+Timeline layer count: {}
 ",
-            self.count,
+            self.tenant_count,
+            self.timeline_count,
+            self.timeline_shard_count,
             self.with_errors.len(),
             self.with_warnings.len(),
             self.with_orphans.len(),
@@ -182,7 +190,7 @@ Timeline layer count: {6}
     }
 
     pub fn is_empty(&self) -> bool {
-        self.count == 0
+        self.timeline_shard_count == 0
     }
 }
 
@@ -233,8 +241,12 @@ pub async fn scan_metadata(
         mut tenant_objects: TenantObjectListing,
         timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
     ) {
+        summary.tenant_count += 1;
+
+        let mut timeline_ids = HashSet::new();
         let mut timeline_generations = HashMap::new();
         for (ttid, data) in timelines {
+            timeline_ids.insert(ttid.timeline_id);
             // Stash the generation of each timeline, for later use identifying orphan layers
             if let BlobDataParseResult::Parsed {
                 index_part: _index_part,
@@ -252,6 +264,8 @@ pub async fn scan_metadata(
             summary.update_analysis(&ttid, &analysis);
         }
 
+        summary.timeline_count += timeline_ids.len();
+
         // Identifying orphan layers must be done on a tenant-wide basis, because individual
         // shards' layers may be referenced by other shards.
         //
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 9c2f5786d4..87a4fa01fc 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -254,7 +254,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     metadata_summary = S3Scrubber(
         neon_env_builder.test_output_dir, neon_env_builder
     ).scan_metadata()
-    assert metadata_summary["count"] == 1  # Scrubber should have seen our timeline
+    assert metadata_summary["tenant_count"] == 1  # Scrubber should have seen our timeline
+    assert metadata_summary["timeline_count"] == 1
+    assert metadata_summary["timeline_shard_count"] == 1
     assert not metadata_summary["with_errors"]
     assert not metadata_summary["with_warnings"]
 

From a4ac8e26e8fb08d936d00ef6a1db22377400689a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 8 Jan 2024 11:46:16 +0100
Subject: [PATCH 20/21] Update Rust to 1.75.0 (#6285)

[Release notes](https://github.com/rust-lang/rust/releases/tag/1.75.0).
---
 Dockerfile.buildtools | 2 +-
 rust-toolchain.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools
index c2fcd8841e..213aed1679 100644
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.74.0
+ENV RUSTC_VERSION=1.75.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index b2cd21d85c..9b5a965f7d 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.74.0"
+channel = "1.75.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html

From 8a646cb75050147d6a03b163b0a959eeeaac77f1 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 8 Jan 2024 11:42:43 +0000
Subject: [PATCH 21/21] proxy: add request context for observability and
 blocking (#6160)

## Summary of changes

### RequestMonitoring

We want to add an event stream with information on each request for
easier analysis than what we can do with diagnostic logs alone
(https://github.com/neondatabase/cloud/issues/8807). This
RequestMonitoring will keep a record of the final state of a request. On
drop it will be pushed into a queue to be uploaded.

Because this context is a bag of data, I don't want this information to
impact logic of request handling. I personally think that weakly typed
data (such as all these options) makes for spaghetti code. I will
however allow for this data to impact rate-limiting and blocking of
requests, as this does not _really_ change how a request is handled.

### Parquet

Each `RequestMonitoring` is flushed into a channel where it is converted
into `RequestData`, which is accumulated into parquet files. Each file
will have a certain number of rows per row group, and several row groups
will eventually fill up the file, which we then upload to S3.

We will also upload smaller files if they take too long to construct.
---
 Cargo.lock                            | 148 +++++-
 Cargo.toml                            |   8 +-
 proxy/Cargo.toml                      |   7 +-
 proxy/src/auth/backend.rs             |  62 ++-
 proxy/src/auth/credentials.rs         |  65 +--
 proxy/src/bin/pg_sni_router.rs        |  15 +-
 proxy/src/bin/proxy.rs                |  14 +
 proxy/src/compute.rs                  |   9 +-
 proxy/src/config.rs                   |   1 +
 proxy/src/console/provider.rs         |  13 +-
 proxy/src/console/provider/mock.rs    |   7 +-
 proxy/src/console/provider/neon.rs    |  29 +-
 proxy/src/context.rs                  | 110 +++++
 proxy/src/context/parquet.rs          | 641 ++++++++++++++++++++++++++
 proxy/src/error.rs                    |  34 ++
 proxy/src/lib.rs                      |   1 +
 proxy/src/metrics.rs                  |  10 +-
 proxy/src/proxy.rs                    |  83 ++--
 proxy/src/proxy/connect_compute.rs    |  34 +-
 proxy/src/proxy/tests.rs              |  28 +-
 proxy/src/serverless.rs               |  15 +-
 proxy/src/serverless/conn_pool.rs     |  63 ++-
 proxy/src/serverless/sql_over_http.rs |  40 +-
 proxy/src/serverless/websocket.rs     |  10 +-
 workspace_hack/Cargo.toml             |  11 +-
 25 files changed, 1209 insertions(+), 249 deletions(-)
 create mode 100644 proxy/src/context.rs
 create mode 100644 proxy/src/context/parquet.rs

diff --git a/Cargo.lock b/Cargo.lock
index 4dd195a895..5f544a05c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -30,6 +30,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
 dependencies = [
  "cfg-if",
+ "const-random",
+ "getrandom 0.2.11",
  "once_cell",
  "version_check",
  "zerocopy",
@@ -50,6 +52,12 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
 
+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -247,6 +255,12 @@ dependencies = [
  "syn 2.0.32",
 ]
 
+[[package]]
+name = "atomic"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
+
 [[package]]
 name = "atomic-polyfill"
 version = "1.0.2"
@@ -1011,17 +1025,17 @@ dependencies = [
 
 [[package]]
 name = "chrono"
-version = "0.4.24"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
 dependencies = [
+ "android-tzdata",
  "iana-time-zone",
  "js-sys",
- "num-integer",
  "num-traits",
  "serde",
  "wasm-bindgen",
- "winapi",
+ "windows-targets 0.48.0",
 ]
 
 [[package]]
@@ -2475,6 +2489,12 @@ dependencies = [
  "web-sys",
 ]
 
+[[package]]
+name = "integer-encoding"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
+
 [[package]]
 name = "io-lifetimes"
 version = "1.0.11"
@@ -2838,6 +2858,19 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "num"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2849,6 +2882,15 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "num-complex"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2859,6 +2901,28 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "num-iter"
+version = "0.1.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.15"
@@ -3081,6 +3145,15 @@ dependencies = [
  "tokio-stream",
 ]
 
+[[package]]
+name = "ordered-float"
+version = "2.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3339,6 +3412,35 @@ dependencies = [
  "windows-targets 0.48.0",
 ]
 
+[[package]]
+name = "parquet"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+dependencies = [
+ "ahash",
+ "bytes",
+ "chrono",
+ "hashbrown 0.14.0",
+ "num",
+ "num-bigint",
+ "paste",
+ "seq-macro",
+ "thrift",
+ "twox-hash",
+ "zstd",
+]
+
+[[package]]
+name = "parquet_derive"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+dependencies = [
+ "parquet",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "password-hash"
 version = "0.5.0"
@@ -3762,6 +3864,8 @@ dependencies = [
  "base64 0.13.1",
  "bstr",
  "bytes",
+ "camino",
+ "camino-tempfile",
  "chrono",
  "clap",
  "consumption_metrics",
@@ -3784,6 +3888,8 @@ dependencies = [
  "once_cell",
  "opentelemetry",
  "parking_lot 0.12.1",
+ "parquet",
+ "parquet_derive",
  "pbkdf2",
  "pin-project-lite",
  "postgres-native-tls",
@@ -3794,6 +3900,7 @@ dependencies = [
  "rand 0.8.5",
  "rcgen",
  "regex",
+ "remote_storage",
  "reqwest",
  "reqwest-middleware",
  "reqwest-retry",
@@ -4682,6 +4789,12 @@ dependencies = [
  "uuid",
 ]
 
+[[package]]
+name = "seq-macro"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
+
 [[package]]
 name = "serde"
 version = "1.0.183"
@@ -5202,6 +5315,17 @@ dependencies = [
  "once_cell",
 ]
 
+[[package]]
+name = "thrift"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
+dependencies = [
+ "byteorder",
+ "integer-encoding",
+ "ordered-float",
+]
+
 [[package]]
 name = "time"
 version = "0.3.21"
@@ -5746,6 +5870,16 @@ dependencies = [
  "utf-8",
 ]
 
+[[package]]
+name = "twox-hash"
+version = "1.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
+dependencies = [
+ "cfg-if",
+ "static_assertions",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -5923,10 +6057,11 @@ dependencies = [
 
 [[package]]
 name = "uuid"
-version = "1.3.3"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
+checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
 dependencies = [
+ "atomic",
  "getrandom 0.2.11",
  "serde",
 ]
@@ -6422,6 +6557,7 @@ dependencies = [
  "num-integer",
  "num-traits",
  "once_cell",
+ "parquet",
  "prost",
  "rand 0.8.5",
  "regex",
diff --git a/Cargo.toml b/Cargo.toml
index 5de636778a..e9172809d7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,6 +107,8 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
+parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -161,7 +163,7 @@ tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
-uuid = { version = "1.2", features = ["v4", "serde"] }
+uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
 x509-parser = "0.15"
@@ -215,6 +217,10 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
+# bug fixes for UUID
+parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+
 ################# Binary contents sections
 
 [profile.release]
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index c94cd55417..5fdfd00a6a 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [features]
-default = []
+default = ["testing"]
 testing = []
 
 [dependencies]
@@ -14,6 +14,7 @@ async-trait.workspace = true
 base64.workspace = true
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
+camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
@@ -35,6 +36,8 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
+parquet.workspace = true
+parquet_derive.workspace = true
 pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -42,6 +45,7 @@ pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 reqwest = { workspace = true, features = ["json"] }
 reqwest-middleware.workspace = true
 reqwest-retry.workspace = true
@@ -80,6 +84,7 @@ smol_str.workspace = true
 workspace_hack.workspace = true
 
 [dev-dependencies]
+camino-tempfile.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 923bd02560..d9bddff139 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -10,6 +10,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::AuthSecret;
+use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
 use crate::scram;
@@ -22,12 +23,10 @@ use crate::{
         provider::{CachedNodeInfo, ConsoleReqExtra},
         Api,
     },
-    metrics::LatencyTimer,
     stream, url,
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
-use std::net::IpAddr;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -129,7 +128,6 @@ pub struct ComputeCredentials<T> {
 
 pub struct ComputeUserInfoNoEndpoint {
     pub user: SmolStr,
-    pub peer_addr: IpAddr,
     pub cache_key: SmolStr,
 }
 
@@ -151,7 +149,6 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
     fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
         let inner = ComputeUserInfoNoEndpoint {
             user: creds.user,
-            peer_addr: creds.peer_addr,
             cache_key: creds.cache_key,
         };
         match creds.project {
@@ -166,33 +163,34 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
+    ctx: &mut RequestMonitoring,
     api: &impl console::Api,
-    extra: &ConsoleReqExtra,
     creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
     let (info, unauthenticated_password) = match creds.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
+            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
+                .await?;
+            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
             (res.info, Some(res.keys))
         }
         Ok(info) => (info, None),
     };
 
     info!("fetching user's authentication info");
-    let allowed_ips = api.get_allowed_ips(extra, &info).await?;
+    let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
 
     // check allowed list
-    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
+    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let cached_secret = api.get_role_secret(extra, &info).await?;
+    let cached_secret = api.get_role_secret(ctx, &info).await?;
 
     let secret = cached_secret.clone().unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
@@ -202,13 +200,13 @@ async fn auth_quirks(
         AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
     });
     match authenticate_with_secret(
+        ctx,
         secret,
         info,
         client,
         unauthenticated_password,
         allow_cleartext,
         config,
-        latency_timer,
     )
     .await
     {
@@ -224,13 +222,13 @@ async fn auth_quirks(
 }
 
 async fn authenticate_with_secret(
+    ctx: &mut RequestMonitoring,
     secret: AuthSecret,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     unauthenticated_password: Option<Vec<u8>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
     if let Some(password) = unauthenticated_password {
         let auth_outcome = validate_password_and_exchange(&password, secret)?;
@@ -253,38 +251,31 @@ async fn authenticate_with_secret(
     // Perform cleartext auth if we're allowed to do that.
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
-        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
+        return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(info, client, config, latency_timer, secret).await
+    classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
 }
 
 /// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
 /// only if authentication was successfuly.
 async fn auth_and_wake_compute(
+    ctx: &mut RequestMonitoring,
     api: &impl console::Api,
     extra: &ConsoleReqExtra,
     creds: ClientCredentials,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials = auth_quirks(
-        api,
-        extra,
-        creds,
-        client,
-        allow_cleartext,
-        config,
-        latency_timer,
-    )
-    .await?;
+    let compute_credentials = auth_quirks(ctx, api, creds, client, allow_cleartext, config).await?;
 
     let mut num_retries = 0;
     let mut node = loop {
-        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
+        let wake_res = api
+            .wake_compute(ctx, extra, &compute_credentials.info)
+            .await;
         match handle_try_wake(wake_res, num_retries) {
             Err(e) => {
                 error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -301,6 +292,8 @@ async fn auth_and_wake_compute(
         tokio::time::sleep(wait_duration).await;
     };
 
+    ctx.set_project(node.aux.clone());
+
     match compute_credentials.keys {
         #[cfg(feature = "testing")]
         ComputeCredentialKeys::Password(password) => node.config.password(password),
@@ -343,11 +336,11 @@ impl<'a> BackendType<'a, ClientCredentials> {
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub async fn authenticate(
         self,
+        ctx: &mut RequestMonitoring,
         extra: &ConsoleReqExtra,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
-        latency_timer: &mut LatencyTimer,
     ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
         use BackendType::*;
 
@@ -360,13 +353,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
                 );
 
                 let (cache_info, user_info) = auth_and_wake_compute(
+                    ctx,
                     &*api,
                     extra,
                     creds,
                     client,
                     allow_cleartext,
                     config,
-                    latency_timer,
                 )
                 .await?;
                 (cache_info, BackendType::Console(api, user_info))
@@ -380,13 +373,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
                 );
 
                 let (cache_info, user_info) = auth_and_wake_compute(
+                    ctx,
                     &*api,
                     extra,
                     creds,
                     client,
                     allow_cleartext,
                     config,
-                    latency_timer,
                 )
                 .await?;
                 (cache_info, BackendType::Postgres(api, user_info))
@@ -416,13 +409,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
 impl BackendType<'_, ComputeUserInfo> {
     pub async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         use BackendType::*;
         match self {
-            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
+            Console(api, creds) => api.get_allowed_ips(ctx, creds).await,
             #[cfg(feature = "testing")]
-            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
+            Postgres(api, creds) => api.get_allowed_ips(ctx, creds).await,
             Link(_) => Ok(Arc::new(vec![])),
             #[cfg(test)]
             Test(x) => x.get_allowed_ips(),
@@ -433,14 +426,15 @@ impl BackendType<'_, ComputeUserInfo> {
     /// The link auth flow doesn't support this, so we return [`None`] in that case.
     pub async fn wake_compute(
         &self,
+        ctx: &mut RequestMonitoring,
         extra: &ConsoleReqExtra,
     ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
         use BackendType::*;
 
         match self {
-            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
+            Console(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await,
             #[cfg(feature = "testing")]
-            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
+            Postgres(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await,
             Link(_) => Ok(None),
             #[cfg(test)]
             Test(x) => x.wake_compute().map(Some),
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index c04769a199..4ddfa722e1 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,7 +1,7 @@
 //! User credentials used in authentication.
 
 use crate::{
-    auth::password_hack::parse_endpoint_param, error::UserFacingError,
+    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
     metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
 };
 use itertools::Itertools;
@@ -44,7 +44,6 @@ pub struct ClientCredentials {
     pub project: Option<SmolStr>,
 
     pub cache_key: SmolStr,
-    pub peer_addr: IpAddr,
 }
 
 impl ClientCredentials {
@@ -56,16 +55,21 @@ impl ClientCredentials {
 
 impl ClientCredentials {
     pub fn parse(
+        ctx: &mut RequestMonitoring,
         params: &StartupMessageParams,
         sni: Option<&str>,
         common_names: Option<HashSet<String>>,
-        peer_addr: IpAddr,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
         // Some parameters are stored in the startup message.
         let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user = get_param("user")?.into();
+        let user: SmolStr = get_param("user")?.into();
+
+        // record the values if we have them
+        ctx.set_application(params.get("application_name").map(SmolStr::from));
+        ctx.set_user(user.clone());
+        ctx.set_endpoint_id(sni.map(SmolStr::from));
 
         // Project name might be passed via PG's command-line options.
         let project_option = params
@@ -147,7 +151,6 @@ impl ClientCredentials {
             user,
             project,
             cache_key,
-            peer_addr,
         })
     }
 }
@@ -219,8 +222,8 @@ mod tests {
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
 
@@ -234,8 +237,8 @@ mod tests {
             ("database", "world"), // should be ignored
             ("foo", "bar"),        // should be ignored
         ]);
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
 
@@ -249,8 +252,8 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("foo"));
         assert_eq!(creds.cache_key, "foo");
@@ -265,8 +268,8 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("bar"));
 
@@ -280,8 +283,8 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("bar"));
 
@@ -298,8 +301,8 @@ mod tests {
             ),
         ]);
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert!(creds.project.is_none());
 
@@ -313,8 +316,8 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
         ]);
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert!(creds.project.is_none());
 
@@ -328,8 +331,8 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("baz"));
 
@@ -342,14 +345,14 @@ mod tests {
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.a.com");
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
         assert_eq!(creds.project.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
         assert_eq!(creds.project.as_deref(), Some("p1"));
 
         Ok(())
@@ -363,8 +366,8 @@ mod tests {
         let sni = Some("second.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+        let mut ctx = RequestMonitoring::test();
+        let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names)
             .expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
@@ -382,8 +385,8 @@ mod tests {
         let sni = Some("project.localhost");
         let common_names = Some(["example.com".into()].into());
 
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+        let mut ctx = RequestMonitoring::test();
+        let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names)
             .expect_err("should fail");
         match err {
             UnknownCommonName { cn } => {
@@ -402,8 +405,8 @@ mod tests {
 
         let sni = Some("project.localhost");
         let common_names = Some(["localhost".into()].into());
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
+        let mut ctx = RequestMonitoring::test();
+        let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
         assert_eq!(creds.project.as_deref(), Some("project"));
         assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
 
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index d48ba3a54e..1edbc1e7e7 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc};
 use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
+use proxy::context::RequestMonitoring;
 use proxy::proxy::run_until_cancelled;
 use tokio::net::TcpListener;
 
@@ -170,7 +171,16 @@ async fn task_main(
                     .context("failed to set socket option")?;
 
                 info!(%peer_addr, "serving");
-                handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
+                let mut ctx =
+                    RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
+                handle_client(
+                    &mut ctx,
+                    dest_suffix,
+                    tls_config,
+                    tls_server_end_point,
+                    socket,
+                )
+                .await
             }
             .unwrap_or_else(|e| {
                 // Acknowledge that the task has finished with an error.
@@ -236,6 +246,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }
 
 async fn handle_client(
+    ctx: &mut RequestMonitoring,
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
@@ -261,5 +272,5 @@ async fn handle_client(
     let client = tokio::net::TcpStream::connect(destination).await?;
 
     let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
+    proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
 }
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 5bc2d377a6..d42906aa4a 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -7,6 +7,7 @@ use proxy::console;
 use proxy::console::provider::AllowedIpsCache;
 use proxy::console::provider::NodeInfoCache;
 use proxy::console::provider::RoleSecretCache;
+use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
@@ -44,6 +45,9 @@ enum AuthBackend {
 #[derive(Parser)]
 #[command(version = GIT_VERSION, about)]
 struct ProxyCliArgs {
+    /// Name of the region this proxy is deployed in
+    #[clap(long, default_value_t = String::new())]
+    region: String,
     /// listen for incoming client connections on ip:port
     #[clap(short, long, default_value = "127.0.0.1:4432")]
     proxy: String,
@@ -133,6 +137,9 @@ struct ProxyCliArgs {
     /// disable ip check for http requests. If it is too time consuming, it could be turned off.
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     disable_ip_check_for_http: bool,
+
+    #[clap(flatten)]
+    parquet_upload: ParquetUploadArgs,
 }
 
 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -221,6 +228,11 @@ async fn main() -> anyhow::Result<()> {
         ));
     }
 
+    client_tasks.spawn(proxy::context::parquet::worker(
+        cancellation_token.clone(),
+        args.parquet_upload,
+    ));
+
     // maintenance tasks. these never return unless there's an error
     let mut maintenance_tasks = JoinSet::new();
     maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
@@ -380,6 +392,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         require_client_ip: args.require_client_ip,
         disable_ip_check_for_http: args.disable_ip_check_for_http,
         endpoint_rps_limit,
+        // TODO: add this argument
+        region: args.region.clone(),
     }));
 
     Ok(config)
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index a54ba56e43..9a5abe2960 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,6 +1,7 @@
 use crate::{
     auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
-    error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
+    context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
+    proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -232,9 +233,9 @@ impl ConnCfg {
     /// Connect to a corresponding compute node.
     pub async fn connect(
         &self,
+        ctx: &mut RequestMonitoring,
         allow_self_signed_compute: bool,
         timeout: Duration,
-        proto: &'static str,
     ) -> Result<PostgresConnection, ConnectionError> {
         let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
 
@@ -268,7 +269,9 @@ impl ConnCfg {
             stream,
             params,
             cancel_closure,
-            _guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
+            _guage: NUM_DB_CONNECTIONS_GAUGE
+                .with_label_values(&[ctx.protocol])
+                .guard(),
         };
 
         Ok(connection)
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 610bf7e424..90956f84d3 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -21,6 +21,7 @@ pub struct ProxyConfig {
     pub require_client_ip: bool,
     pub disable_ip_check_for_http: bool,
     pub endpoint_rps_limit: Vec<RateBucketInfo>,
+    pub region: String,
 }
 
 #[derive(Debug)]
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index e4cf1e8c8e..974384bd5b 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -6,7 +6,9 @@ use super::messages::MetricsAuxInfo;
 use crate::{
     auth::backend::ComputeUserInfo,
     cache::{timed_lru, TimedLru},
-    compute, scram,
+    compute,
+    context::RequestMonitoring,
+    scram,
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
@@ -198,10 +200,6 @@ pub mod errors {
 
 /// Extra query params we'd like to pass to the console.
 pub struct ConsoleReqExtra {
-    /// A unique identifier for a connection.
-    pub session_id: uuid::Uuid,
-    /// Name of client application, if set.
-    pub application_name: String,
     pub options: Vec<(String, String)>,
 }
 
@@ -263,19 +261,20 @@ pub trait Api {
     /// Get the client's auth secret for authentication.
     async fn get_role_secret(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
+        ctx: &mut RequestMonitoring,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index dba5e5863f..fa61ec3221 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -6,8 +6,8 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::console::provider::CachedRoleSecret;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
+use crate::{console::provider::CachedRoleSecret, context::RequestMonitoring};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use thiserror::Error;
@@ -145,7 +145,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        _extra: &ConsoleReqExtra,
+        _ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         Ok(CachedRoleSecret::new_uncached(
@@ -155,7 +155,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips(
         &self,
-        _extra: &ConsoleReqExtra,
+        _ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
@@ -164,6 +164,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
+        _ctx: &mut RequestMonitoring,
         _extra: &ConsoleReqExtra,
         _creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 5bf7b0f986..7867a1e933 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -6,8 +6,11 @@ use super::{
     ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra,
     NodeInfo,
 };
-use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
 use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
+use crate::{
+    context::RequestMonitoring,
+    metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
+};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
@@ -49,19 +52,20 @@ impl Api {
 
     async fn do_get_auth_info(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         let request_id = uuid::Uuid::new_v4().to_string();
+        let application_name = ctx.console_application_name();
         async {
             let request = self
                 .endpoint
                 .get("proxy_get_role_secret")
                 .header("X-Request-ID", &request_id)
                 .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", extra.session_id)])
+                .query(&[("session_id", ctx.session_id)])
                 .query(&[
-                    ("application_name", extra.application_name.as_str()),
+                    ("application_name", application_name.as_str()),
                     ("project", creds.endpoint.as_str()),
                     ("role", creds.inner.user.as_str()),
                 ])
@@ -102,19 +106,21 @@ impl Api {
 
     async fn do_wake_compute(
         &self,
+        ctx: &mut RequestMonitoring,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
         let request_id = uuid::Uuid::new_v4().to_string();
+        let application_name = ctx.console_application_name();
         async {
             let mut request_builder = self
                 .endpoint
                 .get("proxy_wake_compute")
                 .header("X-Request-ID", &request_id)
                 .header("Authorization", format!("Bearer {}", &self.jwt))
-                .query(&[("session_id", extra.session_id)])
+                .query(&[("session_id", ctx.session_id)])
                 .query(&[
-                    ("application_name", extra.application_name.as_str()),
+                    ("application_name", application_name.as_str()),
                     ("project", creds.endpoint.as_str()),
                 ]);
 
@@ -162,7 +168,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let ep = creds.endpoint.clone();
@@ -170,7 +176,7 @@ impl super::Api for Api {
         if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) {
             return Ok(role_secret);
         }
-        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        let auth_info = self.do_get_auth_info(ctx, creds).await?;
         let (_, secret) = self
             .caches
             .role_secret
@@ -183,7 +189,7 @@ impl super::Api for Api {
 
     async fn get_allowed_ips(
         &self,
-        extra: &ConsoleReqExtra,
+        ctx: &mut RequestMonitoring,
         creds: &ComputeUserInfo,
     ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
         if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) {
@@ -195,7 +201,7 @@ impl super::Api for Api {
         ALLOWED_IPS_BY_CACHE_OUTCOME
             .with_label_values(&["miss"])
             .inc();
-        let auth_info = self.do_get_auth_info(extra, creds).await?;
+        let auth_info = self.do_get_auth_info(ctx, creds).await?;
         let allowed_ips = Arc::new(auth_info.allowed_ips);
         let ep = creds.endpoint.clone();
         let user = creds.inner.user.clone();
@@ -209,6 +215,7 @@ impl super::Api for Api {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
+        ctx: &mut RequestMonitoring,
         extra: &ConsoleReqExtra,
         creds: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
@@ -236,7 +243,7 @@ impl super::Api for Api {
             }
         }
 
-        let node = self.do_wake_compute(extra, creds).await?;
+        let node = self.do_wake_compute(ctx, extra, creds).await?;
         let (_, cached) = self.caches.node_info.insert(key.clone(), node);
         info!(key = &*key, "created a cache entry for compute node info");
 
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
new file mode 100644
index 0000000000..47449cf59a
--- /dev/null
+++ b/proxy/src/context.rs
@@ -0,0 +1,110 @@
+//! Connection request monitoring contexts
+
+use chrono::Utc;
+use once_cell::sync::OnceCell;
+use smol_str::SmolStr;
+use std::net::IpAddr;
+use tokio::sync::mpsc;
+use uuid::Uuid;
+
+use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
+
+pub mod parquet;
+
+static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
+
+#[derive(Clone)]
+/// Context data for a single request to connect to a database.
+///
+/// This data should **not** be used for connection logic, only for observability and limiting purposes.
+/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
+pub struct RequestMonitoring {
+    pub peer_addr: IpAddr,
+    pub session_id: Uuid,
+    pub protocol: &'static str,
+    first_packet: chrono::DateTime<Utc>,
+    region: &'static str,
+
+    // filled in as they are discovered
+    project: Option<SmolStr>,
+    branch: Option<SmolStr>,
+    endpoint_id: Option<SmolStr>,
+    user: Option<SmolStr>,
+    application: Option<SmolStr>,
+    error_kind: Option<ErrorKind>,
+
+    // extra
+    // This sender is here to keep the request monitoring channel open while requests are taking place.
+    sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
+    pub latency_timer: LatencyTimer,
+}
+
+impl RequestMonitoring {
+    pub fn new(
+        session_id: Uuid,
+        peer_addr: IpAddr,
+        protocol: &'static str,
+        region: &'static str,
+    ) -> Self {
+        Self {
+            peer_addr,
+            session_id,
+            protocol,
+            first_packet: Utc::now(),
+            region,
+
+            project: None,
+            branch: None,
+            endpoint_id: None,
+            user: None,
+            application: None,
+            error_kind: None,
+
+            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
+            latency_timer: LatencyTimer::new(protocol),
+        }
+    }
+
+    #[cfg(test)]
+    pub fn test() -> Self {
+        RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
+    }
+
+    pub fn console_application_name(&self) -> String {
+        format!(
+            "{}/{}",
+            self.application.as_deref().unwrap_or_default(),
+            self.protocol
+        )
+    }
+
+    pub fn set_project(&mut self, x: MetricsAuxInfo) {
+        self.branch = Some(x.branch_id);
+        self.endpoint_id = Some(x.endpoint_id);
+        self.project = Some(x.project_id);
+    }
+
+    pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
+        self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
+    }
+
+    pub fn set_application(&mut self, app: Option<SmolStr>) {
+        self.application = app.or_else(|| self.application.clone());
+    }
+
+    pub fn set_user(&mut self, user: SmolStr) {
+        self.user = Some(user);
+    }
+
+    pub fn log(&mut self) {
+        if let Some(tx) = self.sender.take() {
+            let _: Result<(), _> = tx.send(self.clone());
+        }
+    }
+}
+
+impl Drop for RequestMonitoring {
+    fn drop(&mut self) {
+        self.log()
+    }
+}
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
new file mode 100644
index 0000000000..ca4eff5ddf
--- /dev/null
+++ b/proxy/src/context/parquet.rs
@@ -0,0 +1,641 @@
+use std::sync::Arc;
+
+use anyhow::Context;
+use bytes::BytesMut;
+use futures::{Stream, StreamExt};
+use parquet::{
+    basic::Compression,
+    file::{
+        metadata::RowGroupMetaDataPtr,
+        properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE},
+        writer::SerializedFileWriter,
+    },
+    record::RecordWriter,
+};
+use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+use tokio::{sync::mpsc, time};
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, info, Span};
+use utils::backoff;
+
+use super::{RequestMonitoring, LOG_CHAN};
+
+#[derive(clap::Args, Clone, Debug)]
+pub struct ParquetUploadArgs {
+    /// Storage location to upload the parquet files to.
+    /// Encoded as toml (same format as pageservers), eg
+    /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
+    #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
+    parquet_upload_remote_storage: OptRemoteStorageConfig,
+
+    /// How many rows to include in a row group
+    #[clap(long, default_value_t = 8192)]
+    parquet_upload_row_group_size: usize,
+
+    /// How large each column page should be in bytes
+    #[clap(long, default_value_t = DEFAULT_PAGE_SIZE)]
+    parquet_upload_page_size: usize,
+
+    /// How large the total parquet file should be in bytes
+    #[clap(long, default_value_t = 100_000_000)]
+    parquet_upload_size: i64,
+
+    /// How long to wait before forcing a file upload
+    #[clap(long, default_value = "20m", value_parser = humantime::parse_duration)]
+    parquet_upload_maximum_duration: tokio::time::Duration,
+
+    /// What level of compression to use
+    #[clap(long, default_value_t = Compression::UNCOMPRESSED)]
+    parquet_upload_compression: Compression,
+}
+
+/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
+/// runtime type errors from the value parser we use.
+type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
+
+fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
+    RemoteStorageConfig::from_toml(&s.parse()?)
+}
+
+// Occasional network issues and such can cause remote operations to fail, and
+// that's expected. If a upload fails, we log it at info-level, and retry.
+// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
+// level instead, as repeated failures can mean a more serious problem. If it
+// fails more than FAILED_UPLOAD_RETRIES times, we give up
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
+
+// the parquet crate leaves a lot to be desired...
+// what follows is an attempt to write parquet files with minimal allocs.
+// complication: parquet is a columnar format, while we want to write in as rows.
+// design:
+// * we batch up to 1024 rows, then flush them into a 'row group'
+// * after each rowgroup write, we check the length of the file and upload to s3 if large enough
+
+#[derive(parquet_derive::ParquetRecordWriter)]
+struct RequestData {
+    region: &'static str,
+    protocol: &'static str,
+    /// Must be UTC. The derive macro doesn't like the timezones
+    timestamp: chrono::NaiveDateTime,
+    session_id: uuid::Uuid,
+    peer_addr: String,
+    username: Option<String>,
+    application_name: Option<String>,
+    endpoint_id: Option<String>,
+    project: Option<String>,
+    branch: Option<String>,
+    error: Option<&'static str>,
+}
+
+impl From<RequestMonitoring> for RequestData {
+    fn from(value: RequestMonitoring) -> Self {
+        Self {
+            session_id: value.session_id,
+            peer_addr: value.peer_addr.to_string(),
+            timestamp: value.first_packet.naive_utc(),
+            username: value.user.as_deref().map(String::from),
+            application_name: value.application.as_deref().map(String::from),
+            endpoint_id: value.endpoint_id.as_deref().map(String::from),
+            project: value.project.as_deref().map(String::from),
+            branch: value.branch.as_deref().map(String::from),
+            protocol: value.protocol,
+            region: value.region,
+            error: value.error_kind.as_ref().map(|e| e.to_str()),
+        }
+    }
+}
+
+/// Parquet request context worker
+///
+/// It listened on a channel for all completed requests, extracts the data and writes it into a parquet file,
+/// then uploads a completed batch to S3
+pub async fn worker(
+    cancellation_token: CancellationToken,
+    config: ParquetUploadArgs,
+) -> anyhow::Result<()> {
+    let Some(remote_storage_config) = config.parquet_upload_remote_storage else {
+        tracing::warn!("parquet request upload: no s3 bucket configured");
+        return Ok(());
+    };
+
+    let (tx, mut rx) = mpsc::unbounded_channel();
+    LOG_CHAN.set(tx.downgrade()).unwrap();
+
+    // setup row stream that will close on cancellation
+    tokio::spawn(async move {
+        cancellation_token.cancelled().await;
+        // dropping this sender will cause the channel to close only once
+        // all the remaining inflight requests have been completed.
+        drop(tx);
+    });
+    let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
+    let rx = rx.map(RequestData::from);
+
+    let storage =
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
+
+    let properties = WriterProperties::builder()
+        .set_data_page_size_limit(config.parquet_upload_page_size)
+        .set_compression(config.parquet_upload_compression);
+
+    let parquet_config = ParquetConfig {
+        propeties: Arc::new(properties.build()),
+        rows_per_group: config.parquet_upload_row_group_size,
+        file_size: config.parquet_upload_size,
+        max_duration: config.parquet_upload_maximum_duration,
+
+        #[cfg(any(test, feature = "testing"))]
+        test_remote_failures: 0,
+    };
+
+    worker_inner(storage, rx, parquet_config).await
+}
+
+struct ParquetConfig {
+    propeties: WriterPropertiesPtr,
+    rows_per_group: usize,
+    file_size: i64,
+
+    max_duration: tokio::time::Duration,
+
+    #[cfg(any(test, feature = "testing"))]
+    test_remote_failures: u64,
+}
+
+async fn worker_inner(
+    storage: GenericRemoteStorage,
+    rx: impl Stream<Item = RequestData>,
+    config: ParquetConfig,
+) -> anyhow::Result<()> {
+    #[cfg(any(test, feature = "testing"))]
+    let storage = if config.test_remote_failures > 0 {
+        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
+    } else {
+        storage
+    };
+
+    let mut rx = std::pin::pin!(rx);
+
+    let mut rows = Vec::with_capacity(config.rows_per_group);
+
+    let schema = rows.as_slice().schema()?;
+    let file = BytesWriter::default();
+    let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
+
+    let mut last_upload = time::Instant::now();
+
+    let mut len = 0;
+    while let Some(row) = rx.next().await {
+        rows.push(row);
+        let force = last_upload.elapsed() > config.max_duration;
+        if rows.len() == config.rows_per_group || force {
+            let rg_meta;
+            (rows, w, rg_meta) = flush_rows(rows, w).await?;
+            len += rg_meta.compressed_size();
+        }
+        if len > config.file_size || force {
+            last_upload = time::Instant::now();
+            let file = upload_parquet(w, len, &storage).await?;
+            w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
+            len = 0;
+        }
+    }
+
+    if !rows.is_empty() {
+        let rg_meta;
+        (_, w, rg_meta) = flush_rows(rows, w).await?;
+        len += rg_meta.compressed_size();
+    }
+
+    if !w.flushed_row_groups().is_empty() {
+        let _: BytesWriter = upload_parquet(w, len, &storage).await?;
+    }
+
+    Ok(())
+}
+
+async fn flush_rows(
+    rows: Vec<RequestData>,
+    mut w: SerializedFileWriter<BytesWriter>,
+) -> anyhow::Result<(
+    Vec<RequestData>,
+    SerializedFileWriter<BytesWriter>,
+    RowGroupMetaDataPtr,
+)> {
+    let span = Span::current();
+    let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || {
+        let _enter = span.enter();
+
+        let mut rg = w.next_row_group()?;
+        rows.as_slice().write_to_row_group(&mut rg)?;
+        let rg_meta = rg.close()?;
+
+        let size = rg_meta.compressed_size();
+        let compression = rg_meta.compressed_size() as f64 / rg_meta.total_byte_size() as f64;
+
+        debug!(size, compression, "flushed row group to parquet file");
+
+        Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta))
+    })
+    .await
+    .unwrap()?;
+
+    rows.clear();
+    Ok((rows, w, rg_meta))
+}
+
+async fn upload_parquet(
+    w: SerializedFileWriter<BytesWriter>,
+    len: i64,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<BytesWriter> {
+    let len_uncompressed = w
+        .flushed_row_groups()
+        .iter()
+        .map(|rg| rg.total_byte_size())
+        .sum::<i64>();
+
+    // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
+    // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
+    let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish())
+        .await
+        .unwrap()?;
+
+    let data = file.buf.split().freeze();
+
+    let compression = len as f64 / len_uncompressed as f64;
+    let size = data.len();
+    let id = uuid::Uuid::now_v7();
+
+    info!(
+        %id,
+        rows = metadata.num_rows,
+        size, compression, "uploading request parquet file"
+    );
+
+    let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
+    backoff::retry(
+        || async {
+            let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
+            storage.upload(stream, data.len(), &path, None).await
+        },
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_UPLOAD_MAX_RETRIES,
+        "request_data_upload",
+        // we don't want cancellation to interrupt here, so we make a dummy cancel token
+        backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")),
+    )
+    .await
+    .context("request_data_upload")?;
+
+    Ok(file)
+}
+
+// why doesn't BytesMut impl io::Write?
+#[derive(Default)]
+struct BytesWriter {
+    buf: BytesMut,
+}
+
+impl std::io::Write for BytesWriter {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        self.buf.extend_from_slice(buf);
+        Ok(buf.len())
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc};
+
+    use camino::Utf8Path;
+    use clap::Parser;
+    use futures::{Stream, StreamExt};
+    use itertools::Itertools;
+    use parquet::{
+        basic::{Compression, ZstdLevel},
+        file::{
+            properties::{WriterProperties, DEFAULT_PAGE_SIZE},
+            reader::FileReader,
+            serialized_reader::SerializedFileReader,
+        },
+    };
+    use rand::{rngs::StdRng, Rng, SeedableRng};
+    use remote_storage::{
+        GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
+        DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+    };
+    use tokio::{sync::mpsc, time};
+
+    use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
+
+    #[derive(Parser)]
+    struct ProxyCliArgs {
+        #[clap(flatten)]
+        parquet_upload: ParquetUploadArgs,
+    }
+
+    #[test]
+    fn default_parser() {
+        let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from(["proxy"]);
+        assert_eq!(parquet_upload.parquet_upload_remote_storage, None);
+        assert_eq!(parquet_upload.parquet_upload_row_group_size, 8192);
+        assert_eq!(parquet_upload.parquet_upload_page_size, DEFAULT_PAGE_SIZE);
+        assert_eq!(parquet_upload.parquet_upload_size, 100_000_000);
+        assert_eq!(
+            parquet_upload.parquet_upload_maximum_duration,
+            time::Duration::from_secs(20 * 60)
+        );
+        assert_eq!(
+            parquet_upload.parquet_upload_compression,
+            Compression::UNCOMPRESSED
+        );
+    }
+
+    #[test]
+    fn full_parser() {
+        let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from([
+            "proxy",
+            "--parquet-upload-remote-storage",
+            "{bucket_name='default',prefix_in_bucket='proxy/',bucket_region='us-east-1',endpoint='http://minio:9000'}",
+            "--parquet-upload-row-group-size",
+            "100",
+            "--parquet-upload-page-size",
+            "10000",
+            "--parquet-upload-size",
+            "10000000",
+            "--parquet-upload-maximum-duration",
+            "10m",
+            "--parquet-upload-compression",
+            "zstd(5)",
+        ]);
+        assert_eq!(
+            parquet_upload.parquet_upload_remote_storage,
+            Some(RemoteStorageConfig {
+                storage: RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: "default".into(),
+                    bucket_region: "us-east-1".into(),
+                    prefix_in_bucket: Some("proxy/".into()),
+                    endpoint: Some("http://minio:9000".into()),
+                    concurrency_limit: NonZeroUsize::new(
+                        DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+                    )
+                    .unwrap(),
+                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                })
+            })
+        );
+        assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
+        assert_eq!(parquet_upload.parquet_upload_page_size, 10000);
+        assert_eq!(parquet_upload.parquet_upload_size, 10_000_000);
+        assert_eq!(
+            parquet_upload.parquet_upload_maximum_duration,
+            time::Duration::from_secs(10 * 60)
+        );
+        assert_eq!(
+            parquet_upload.parquet_upload_compression,
+            Compression::ZSTD(ZstdLevel::try_new(5).unwrap())
+        );
+    }
+
+    fn generate_request_data(rng: &mut impl Rng) -> RequestData {
+        RequestData {
+            session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
+            peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
+            timestamp: chrono::NaiveDateTime::from_timestamp_millis(
+                rng.gen_range(1703862754..1803862754),
+            )
+            .unwrap(),
+            application_name: Some("test".to_owned()),
+            username: Some(hex::encode(rng.gen::<[u8; 4]>())),
+            endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
+            project: Some(hex::encode(rng.gen::<[u8; 16]>())),
+            branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
+            protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
+            region: "us-east-1",
+            error: None,
+        }
+    }
+
+    fn random_stream(len: usize) -> impl Stream<Item = RequestData> + Unpin {
+        let mut rng = StdRng::from_seed([0x39; 32]);
+        futures::stream::iter(
+            std::iter::repeat_with(move || generate_request_data(&mut rng)).take(len),
+        )
+    }
+
+    async fn run_test(
+        tmpdir: &Utf8Path,
+        config: ParquetConfig,
+        rx: impl Stream<Item = RequestData>,
+    ) -> Vec<(u64, usize, i64)> {
+        let remote_storage_config = RemoteStorageConfig {
+            storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
+        };
+        let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
+
+        worker_inner(storage, rx, config).await.unwrap();
+
+        let mut files = std::fs::read_dir(tmpdir.as_std_path())
+            .unwrap()
+            .map(|entry| entry.unwrap().path())
+            .collect_vec();
+        files.sort();
+
+        files
+            .into_iter()
+            .map(|path| std::fs::File::open(tmpdir.as_std_path().join(path)).unwrap())
+            .map(|file| {
+                (
+                    file.metadata().unwrap(),
+                    SerializedFileReader::new(file).unwrap().metadata().clone(),
+                )
+            })
+            .map(|(file_meta, parquet_meta)| {
+                (
+                    file_meta.len(),
+                    parquet_meta.num_row_groups(),
+                    parquet_meta.file_metadata().num_rows(),
+                )
+            })
+            .collect()
+    }
+
+    #[tokio::test]
+    async fn verify_parquet_no_compression() {
+        let tmpdir = camino_tempfile::tempdir().unwrap();
+
+        let config = ParquetConfig {
+            propeties: Arc::new(WriterProperties::new()),
+            rows_per_group: 2_000,
+            file_size: 1_000_000,
+            max_duration: time::Duration::from_secs(20 * 60),
+            test_remote_failures: 0,
+        };
+
+        let rx = random_stream(50_000);
+        let file_stats = run_test(tmpdir.path(), config, rx).await;
+
+        assert_eq!(
+            file_stats,
+            [
+                (1029153, 3, 6000),
+                (1029075, 3, 6000),
+                (1029216, 3, 6000),
+                (1029129, 3, 6000),
+                (1029250, 3, 6000),
+                (1029017, 3, 6000),
+                (1029175, 3, 6000),
+                (1029247, 3, 6000),
+                (343124, 1, 2000)
+            ],
+        );
+
+        tmpdir.close().unwrap();
+    }
+
+    #[tokio::test]
+    async fn verify_parquet_min_compression() {
+        let tmpdir = camino_tempfile::tempdir().unwrap();
+
+        let config = ParquetConfig {
+            propeties: Arc::new(
+                WriterProperties::builder()
+                    .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default()))
+                    .build(),
+            ),
+            rows_per_group: 2_000,
+            file_size: 1_000_000,
+            max_duration: time::Duration::from_secs(20 * 60),
+            test_remote_failures: 0,
+        };
+
+        let rx = random_stream(50_000);
+        let file_stats = run_test(tmpdir.path(), config, rx).await;
+
+        // with compression, there are fewer files with more rows per file
+        assert_eq!(
+            file_stats,
+            [
+                (1166201, 6, 12000),
+                (1163577, 6, 12000),
+                (1164641, 6, 12000),
+                (1168772, 6, 12000),
+                (196761, 1, 2000)
+            ],
+        );
+
+        tmpdir.close().unwrap();
+    }
+
+    #[tokio::test]
+    async fn verify_parquet_strong_compression() {
+        let tmpdir = camino_tempfile::tempdir().unwrap();
+
+        let config = ParquetConfig {
+            propeties: Arc::new(
+                WriterProperties::builder()
+                    .set_compression(parquet::basic::Compression::ZSTD(
+                        ZstdLevel::try_new(10).unwrap(),
+                    ))
+                    .build(),
+            ),
+            rows_per_group: 2_000,
+            file_size: 1_000_000,
+            max_duration: time::Duration::from_secs(20 * 60),
+            test_remote_failures: 0,
+        };
+
+        let rx = random_stream(50_000);
+        let file_stats = run_test(tmpdir.path(), config, rx).await;
+
+        // with strong compression, the files are smaller
+        assert_eq!(
+            file_stats,
+            [
+                (1144934, 6, 12000),
+                (1144941, 6, 12000),
+                (1144735, 6, 12000),
+                (1144936, 6, 12000),
+                (191035, 1, 2000)
+            ],
+        );
+
+        tmpdir.close().unwrap();
+    }
+
+    #[tokio::test]
+    async fn verify_parquet_unreliable_upload() {
+        let tmpdir = camino_tempfile::tempdir().unwrap();
+
+        let config = ParquetConfig {
+            propeties: Arc::new(WriterProperties::new()),
+            rows_per_group: 2_000,
+            file_size: 1_000_000,
+            max_duration: time::Duration::from_secs(20 * 60),
+            test_remote_failures: 2,
+        };
+
+        let rx = random_stream(50_000);
+        let file_stats = run_test(tmpdir.path(), config, rx).await;
+
+        assert_eq!(
+            file_stats,
+            [
+                (1029153, 3, 6000),
+                (1029075, 3, 6000),
+                (1029216, 3, 6000),
+                (1029129, 3, 6000),
+                (1029250, 3, 6000),
+                (1029017, 3, 6000),
+                (1029175, 3, 6000),
+                (1029247, 3, 6000),
+                (343124, 1, 2000)
+            ],
+        );
+
+        tmpdir.close().unwrap();
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn verify_parquet_regular_upload() {
+        let tmpdir = camino_tempfile::tempdir().unwrap();
+
+        let config = ParquetConfig {
+            propeties: Arc::new(WriterProperties::new()),
+            rows_per_group: 2_000,
+            file_size: 1_000_000,
+            max_duration: time::Duration::from_secs(60),
+            test_remote_failures: 2,
+        };
+
+        let (tx, mut rx) = mpsc::unbounded_channel();
+
+        tokio::spawn(async move {
+            for _ in 0..3 {
+                let mut s = random_stream(3000);
+                while let Some(r) = s.next().await {
+                    tx.send(r).unwrap();
+                }
+                time::sleep(time::Duration::from_secs(70)).await
+            }
+        });
+
+        let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
+        let file_stats = run_test(tmpdir.path(), config, rx).await;
+
+        // files are smaller than the size threshold, but they took too long to fill so were flushed early
+        assert_eq!(
+            file_stats,
+            [(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
+        );
+
+        tmpdir.close().unwrap();
+    }
+}
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index f1cb44b1a8..5b2dd7ecfd 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -28,3 +28,37 @@ pub trait UserFacingError: fmt::Display {
         self.to_string()
     }
 }
+
+#[derive(Clone)]
+pub enum ErrorKind {
+    /// Wrong password, unknown endpoint, protocol violation, etc...
+    User,
+
+    /// Network error between user and proxy. Not necessarily user error
+    Disconnect,
+
+    /// Proxy self-imposed rate limits
+    RateLimit,
+
+    /// internal errors
+    Service,
+
+    /// Error communicating with control plane
+    ControlPlane,
+
+    /// Error communicating with compute
+    Compute,
+}
+
+impl ErrorKind {
+    pub fn to_str(&self) -> &'static str {
+        match self {
+            ErrorKind::User => "request failed due to user error",
+            ErrorKind::Disconnect => "client disconnected",
+            ErrorKind::RateLimit => "request cancelled due to rate limit",
+            ErrorKind::Service => "internal service error",
+            ErrorKind::ControlPlane => "non-retryable control plane error",
+            ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)",
+        }
+    }
+}
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index 2da1eaf482..87ae8894e1 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -13,6 +13,7 @@ pub mod cancellation;
 pub mod compute;
 pub mod config;
 pub mod console;
+pub mod context;
 pub mod error;
 pub mod http;
 pub mod logging;
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 8e2a6105b1..6e4cbb3f3a 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -115,11 +115,12 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
     .unwrap()
 });
 
+#[derive(Clone)]
 pub struct LatencyTimer {
     // time since the stopwatch was started
     start: Option<time::Instant>,
     // accumulated time on the stopwatch
-    accumulated: std::time::Duration,
+    pub accumulated: std::time::Duration,
     // label data
     protocol: &'static str,
     cache_miss: bool,
@@ -160,7 +161,12 @@ impl LatencyTimer {
         self.pool_miss = false;
     }
 
-    pub fn success(mut self) {
+    pub fn success(&mut self) {
+        // stop the stopwatch and record the time that we have accumulated
+        let start = self.start.take().expect("latency timer should be started");
+        self.accumulated += start.elapsed();
+
+        // success
         self.outcome = "success";
     }
 }
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 17e910860c..4aba222082 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -10,8 +10,9 @@ use crate::{
     compute,
     config::{AuthenticationConfig, ProxyConfig, TlsConfig},
     console::{self, messages::MetricsAuxInfo},
+    context::RequestMonitoring,
     metrics::{
-        LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
+        NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
         NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
     },
     protocol2::WithClientIp,
@@ -25,7 +26,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use regex::Regex;
-use std::{net::IpAddr, sync::Arc};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};
@@ -82,14 +83,16 @@ pub async fn task_main(
                 info!("accepted postgres client connection");
 
                 let mut socket = WithClientIp::new(socket);
-                let mut peer_addr = peer_addr;
-                if let Some(ip) = socket.wait_for_addr().await? {
-                    peer_addr = ip;
-                    tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
+                let mut peer_addr = peer_addr.ip();
+                if let Some(addr) = socket.wait_for_addr().await? {
+                    peer_addr = addr.ip();
+                    tracing::Span::current().record("peer_addr", &tracing::field::display(addr));
                 } else if config.require_client_ip {
                     bail!("missing required client IP");
                 }
 
+                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
+
                 socket
                     .inner
                     .set_nodelay(true)
@@ -97,11 +100,10 @@ pub async fn task_main(
 
                 handle_client(
                     config,
+                    &mut ctx,
                     &cancel_map,
-                    session_id,
                     socket,
                     ClientMode::Tcp,
-                    peer_addr.ip(),
                     endpoint_rate_limiter,
                 )
                 .await
@@ -134,13 +136,6 @@ pub enum ClientMode {
 
 /// Abstracts the logic of handling TCP vs WS clients
 impl ClientMode {
-    fn protocol_label(&self) -> &'static str {
-        match self {
-            ClientMode::Tcp => "tcp",
-            ClientMode::Websockets { .. } => "ws",
-        }
-    }
-
     fn allow_cleartext(&self) -> bool {
         match self {
             ClientMode::Tcp => false,
@@ -173,19 +168,18 @@ impl ClientMode {
 
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
+    ctx: &mut RequestMonitoring,
     cancel_map: &CancelMap,
-    session_id: uuid::Uuid,
     stream: S,
     mode: ClientMode,
-    peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     info!(
-        protocol = mode.protocol_label(),
+        protocol = ctx.protocol,
         "handling interactive connection from client"
     );
 
-    let proto = mode.protocol_label();
+    let proto = ctx.protocol;
     let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
         .with_label_values(&[proto])
         .guard();
@@ -195,20 +189,23 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let tls = config.tls_config.as_ref();
 
+    let pause = ctx.latency_timer.pause();
     let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
     let (mut stream, params) = match do_handshake.await? {
         Some(x) => x,
         None => return Ok(()), // it's a cancellation request
     };
+    drop(pause);
 
     // Extract credentials which we're going to use for auth.
     let creds = {
         let hostname = mode.hostname(stream.get_ref());
+
         let common_names = tls.and_then(|tls| tls.common_names.clone());
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_names, peer_addr))
+            .map(|_| auth::ClientCredentials::parse(ctx, &params, hostname, common_names))
             .transpose();
 
         match result {
@@ -217,16 +214,19 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         }
     };
 
+    ctx.set_endpoint_id(creds.get_endpoint());
+
     let client = Client::new(
         stream,
         creds,
         &params,
-        session_id,
         mode.allow_self_signed_compute(config),
         endpoint_rate_limiter,
     );
     cancel_map
-        .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
+        .with_session(|session| {
+            client.connect_to_db(ctx, session, mode, &config.authentication_config)
+        })
         .await
 }
 
@@ -348,10 +348,13 @@ async fn prepare_client_connection(
 /// Forward bytes in both directions (client <-> compute).
 #[tracing::instrument(skip_all)]
 pub async fn proxy_pass(
+    ctx: &mut RequestMonitoring,
     client: impl AsyncRead + AsyncWrite + Unpin,
     compute: impl AsyncRead + AsyncWrite + Unpin,
     aux: MetricsAuxInfo,
 ) -> anyhow::Result<()> {
+    ctx.log();
+
     let usage = USAGE_METRICS.register(Ids {
         endpoint_id: aux.endpoint_id.clone(),
         branch_id: aux.branch_id.clone(),
@@ -397,8 +400,6 @@ struct Client<'a, S> {
     creds: auth::BackendType<'a, auth::ClientCredentials>,
     /// KV-dictionary with PostgreSQL connection params.
     params: &'a StartupMessageParams,
-    /// Unique connection ID.
-    session_id: uuid::Uuid,
     /// Allow self-signed certificates (for testing).
     allow_self_signed_compute: bool,
     /// Rate limiter for endpoints
@@ -411,7 +412,6 @@ impl<'a, S> Client<'a, S> {
         stream: PqStream<Stream<S>>,
         creds: auth::BackendType<'a, auth::ClientCredentials>,
         params: &'a StartupMessageParams,
-        session_id: uuid::Uuid,
         allow_self_signed_compute: bool,
         endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> Self {
@@ -419,7 +419,6 @@ impl<'a, S> Client<'a, S> {
             stream,
             creds,
             params,
-            session_id,
             allow_self_signed_compute,
             endpoint_rate_limiter,
         }
@@ -433,6 +432,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
     async fn connect_to_db(
         self,
+        ctx: &mut RequestMonitoring,
         session: cancellation::Session<'_>,
         mode: ClientMode,
         config: &'static AuthenticationConfig,
@@ -441,7 +441,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             mut stream,
             creds,
             params,
-            session_id,
             allow_self_signed_compute,
             endpoint_rate_limiter,
         } = self;
@@ -455,27 +454,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             }
         }
 
-        let proto = mode.protocol_label();
         let extra = console::ConsoleReqExtra {
-            session_id, // aka this connection's id
-            application_name: format!(
-                "{}/{}",
-                params.get("application_name").unwrap_or_default(),
-                proto
-            ),
             options: neon_options(params),
         };
-        let mut latency_timer = LatencyTimer::new(proto);
 
         let user = creds.get_user().to_owned();
         let auth_result = match creds
-            .authenticate(
-                &extra,
-                &mut stream,
-                mode.allow_cleartext(),
-                config,
-                &mut latency_timer,
-            )
+            .authenticate(ctx, &extra, &mut stream, mode.allow_cleartext(), config)
             .await
         {
             Ok(auth_result) => auth_result,
@@ -493,15 +478,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         node_info.allow_self_signed_compute = allow_self_signed_compute;
 
         let aux = node_info.aux.clone();
-        let mut node = connect_to_compute(
-            &TcpMechanism { params, proto },
-            node_info,
-            &extra,
-            &creds,
-            latency_timer,
-        )
-        .or_else(|e| stream.throw_error(e))
-        .await?;
+        let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &extra, &creds)
+            .or_else(|e| stream.throw_error(e))
+            .await?;
 
         prepare_client_connection(&node, session, &mut stream).await?;
         // Before proxy passing, forward to compute whatever data is left in the
@@ -510,7 +489,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         // immediately after opening the connection.
         let (stream, read_buf) = stream.into_inner();
         node.stream.write_all(&read_buf).await?;
-        proxy_pass(stream, node.stream, aux).await
+        proxy_pass(ctx, stream, node.stream, aux).await
     }
 }
 
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 88b0019c49..1da2dee10b 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -2,7 +2,8 @@ use crate::{
     auth,
     compute::{self, PostgresConnection},
     console::{self, errors::WakeComputeError, Api},
-    metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
+    context::RequestMonitoring,
+    metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
     proxy::retry::{retry_after, ShouldRetry},
 };
 use async_trait::async_trait;
@@ -35,15 +36,15 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
 /// Try to connect to the compute node once.
 #[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
 async fn connect_to_compute_once(
+    ctx: &mut RequestMonitoring,
     node_info: &console::CachedNodeInfo,
     timeout: time::Duration,
-    proto: &'static str,
 ) -> Result<PostgresConnection, compute::ConnectionError> {
     let allow_self_signed_compute = node_info.allow_self_signed_compute;
 
     node_info
         .config
-        .connect(allow_self_signed_compute, timeout, proto)
+        .connect(ctx, allow_self_signed_compute, timeout)
         .await
 }
 
@@ -54,6 +55,7 @@ pub trait ConnectMechanism {
     type Error: From<Self::ConnectError>;
     async fn connect_once(
         &self,
+        ctx: &mut RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError>;
@@ -64,7 +66,6 @@ pub trait ConnectMechanism {
 pub struct TcpMechanism<'a> {
     /// KV-dictionary with PostgreSQL connection params.
     pub params: &'a StartupMessageParams,
-    pub proto: &'static str,
 }
 
 #[async_trait]
@@ -75,10 +76,11 @@ impl ConnectMechanism for TcpMechanism<'_> {
 
     async fn connect_once(
         &self,
+        ctx: &mut RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
-        connect_to_compute_once(node_info, timeout, self.proto).await
+        connect_to_compute_once(ctx, node_info, timeout).await
     }
 
     fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -123,11 +125,11 @@ fn report_error(e: &WakeComputeError, retry: bool) {
 /// This function might update `node_info`, so we take it by `&mut`.
 #[tracing::instrument(skip_all)]
 pub async fn connect_to_compute<M: ConnectMechanism>(
+    ctx: &mut RequestMonitoring,
     mechanism: &M,
     mut node_info: console::CachedNodeInfo,
     extra: &console::ConsoleReqExtra,
     creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
-    mut latency_timer: LatencyTimer,
 ) -> Result<M::Connection, M::Error>
 where
     M::ConnectError: ShouldRetry + std::fmt::Debug,
@@ -136,9 +138,12 @@ where
     mechanism.update_connect_config(&mut node_info.config);
 
     // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+    let (config, err) = match mechanism
+        .connect_once(ctx, &node_info, CONNECT_TIMEOUT)
+        .await
+    {
         Ok(res) => {
-            latency_timer.success();
+            ctx.latency_timer.success();
             return Ok(res);
         }
         Err(e) => {
@@ -147,7 +152,7 @@ where
         }
     };
 
-    latency_timer.cache_miss();
+    ctx.latency_timer.cache_miss();
 
     let mut num_retries = 1;
 
@@ -155,9 +160,9 @@ where
     info!("compute node's state has likely changed; requesting a wake-up");
     let node_info = loop {
         let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            auth::BackendType::Console(api, creds) => api.wake_compute(ctx, extra, creds).await,
             #[cfg(feature = "testing")]
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(ctx, extra, creds).await,
             // nothing to do?
             auth::BackendType::Link(_) => return Err(err.into()),
             // test backend
@@ -195,9 +200,12 @@ where
     // * DNS connection settings haven't quite propagated yet
     info!("wake_compute success. attempting to connect");
     loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        match mechanism
+            .connect_once(ctx, &node_info, CONNECT_TIMEOUT)
+            .await
+        {
             Ok(res) => {
-                latency_timer.success();
+                ctx.latency_timer.success();
                 return Ok(res);
             }
             Err(e) => {
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 3c483c59ee..0957f33a92 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -425,6 +425,7 @@ impl ConnectMechanism for TestConnectMechanism {
 
     async fn connect_once(
         &self,
+        _ctx: &mut RequestMonitoring,
         _node_info: &console::CachedNodeInfo,
         _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -491,11 +492,7 @@ fn helper_create_connect_info(
     auth::BackendType<'_, ComputeUserInfo>,
 ) {
     let cache = helper_create_cached_node_info();
-    let extra = console::ConsoleReqExtra {
-        session_id: uuid::Uuid::new_v4(),
-        application_name: "TEST".into(),
-        options: vec![],
-    };
+    let extra = console::ConsoleReqExtra { options: vec![] };
     let creds = auth::BackendType::Test(mechanism);
     (cache, extra, creds)
 }
@@ -503,9 +500,10 @@ fn helper_create_connect_info(
 #[tokio::test]
 async fn connect_to_compute_success() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Connect]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap();
     mechanism.verify();
@@ -514,9 +512,10 @@ async fn connect_to_compute_success() {
 #[tokio::test]
 async fn connect_to_compute_retry() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap();
     mechanism.verify();
@@ -526,9 +525,10 @@ async fn connect_to_compute_retry() {
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap_err();
     mechanism.verify();
@@ -538,9 +538,10 @@ async fn connect_to_compute_non_retry_1() {
 #[tokio::test]
 async fn connect_to_compute_non_retry_2() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap();
     mechanism.verify();
@@ -551,12 +552,13 @@ async fn connect_to_compute_non_retry_2() {
 async fn connect_to_compute_non_retry_3() {
     assert_eq!(NUM_RETRIES_CONNECT, 16);
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![
         Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
         Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
     ]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap_err();
     mechanism.verify();
@@ -566,9 +568,10 @@ async fn connect_to_compute_non_retry_3() {
 #[tokio::test]
 async fn wake_retry() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap();
     mechanism.verify();
@@ -578,9 +581,10 @@ async fn wake_retry() {
 #[tokio::test]
 async fn wake_non_retry() {
     use ConnectAction::*;
+    let mut ctx = RequestMonitoring::test();
     let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
     let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
+    connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
         .await
         .unwrap_err();
     mechanism.verify();
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 07825da8dc..83a9773052 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -17,6 +17,7 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
+use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
 use crate::rate_limiter::EndpointRateLimiter;
@@ -218,13 +219,14 @@ async fn request_handler(
 
         ws_connections.spawn(
             async move {
+                let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
+
                 if let Err(e) = websocket::serve_websocket(
-                    websocket,
                     config,
+                    &mut ctx,
+                    websocket,
                     &cancel_map,
-                    session_id,
                     host,
-                    peer_addr,
                     endpoint_rate_limiter,
                 )
                 .await
@@ -238,13 +240,14 @@ async fn request_handler(
         // Return the response so the spawned future can continue.
         Ok(response)
     } else if request.uri().path() == "/sql" && request.method() == Method::POST {
+        let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
+
         sql_over_http::handle(
+            &config.http_config,
+            &mut ctx,
             request,
             sni_hostname,
             conn_pool,
-            session_id,
-            peer_addr,
-            &config.http_config,
         )
         .await
     } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index c476560215..c9f3fd6a38 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -13,7 +13,7 @@ use pq_proto::StartupMessageParams;
 use prometheus::{exponential_buckets, register_histogram, Histogram};
 use rand::Rng;
 use smol_str::SmolStr;
-use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
+use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
 use std::{
     fmt,
     task::{ready, Poll},
@@ -28,7 +28,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 use crate::{
     auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
     console,
-    metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
+    context::RequestMonitoring,
+    metrics::NUM_DB_CONNECTIONS_GAUGE,
     proxy::{connect_compute::ConnectMechanism, neon_options},
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
@@ -309,13 +310,11 @@ impl GlobalConnPool {
 
     pub async fn get(
         self: &Arc<Self>,
+        ctx: &mut RequestMonitoring,
         conn_info: ConnInfo,
         force_new: bool,
-        session_id: uuid::Uuid,
-        peer_addr: IpAddr,
     ) -> anyhow::Result<Client> {
         let mut client: Option<ClientInner> = None;
-        let mut latency_timer = LatencyTimer::new("http");
 
         let mut hash_valid = false;
         let mut endpoint_pool = Weak::new();
@@ -360,23 +359,21 @@ impl GlobalConnPool {
                 info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
                 connect_to_compute(
                     self.proxy_config,
+                    ctx,
                     &conn_info,
                     conn_id,
-                    session_id,
-                    latency_timer,
-                    peer_addr,
                     endpoint_pool.clone(),
                 )
                 .await
             } else {
                 info!("pool: reusing connection '{conn_info}'");
-                client.session.send(session_id)?;
+                client.session.send(ctx.session_id)?;
                 tracing::Span::current().record(
                     "pid",
                     &tracing::field::display(client.inner.get_process_id()),
                 );
-                latency_timer.pool_hit();
-                latency_timer.success();
+                ctx.latency_timer.pool_hit();
+                ctx.latency_timer.success();
                 return Ok(Client::new(client, conn_info, endpoint_pool).await);
             }
         } else {
@@ -384,11 +381,9 @@ impl GlobalConnPool {
             info!(%conn_id, "pool: opening a new connection '{conn_info}'");
             connect_to_compute(
                 self.proxy_config,
+                ctx,
                 &conn_info,
                 conn_id,
-                session_id,
-                latency_timer,
-                peer_addr,
                 endpoint_pool.clone(),
             )
             .await
@@ -483,7 +478,6 @@ impl GlobalConnPool {
 struct TokioMechanism<'a> {
     pool: Weak<RwLock<EndpointConnPool>>,
     conn_info: &'a ConnInfo,
-    session_id: uuid::Uuid,
     conn_id: uuid::Uuid,
     idle: Duration,
 }
@@ -496,15 +490,16 @@ impl ConnectMechanism for TokioMechanism<'_> {
 
     async fn connect_once(
         &self,
+        ctx: &mut RequestMonitoring,
         node_info: &console::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
         connect_to_compute_once(
+            ctx,
             node_info,
             self.conn_info,
             timeout,
             self.conn_id,
-            self.session_id,
             self.pool.clone(),
             self.idle,
         )
@@ -520,11 +515,9 @@ impl ConnectMechanism for TokioMechanism<'_> {
 #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
 async fn connect_to_compute(
     config: &config::ProxyConfig,
+    ctx: &mut RequestMonitoring,
     conn_info: &ConnInfo,
     conn_id: uuid::Uuid,
-    session_id: uuid::Uuid,
-    latency_timer: LatencyTimer,
-    peer_addr: IpAddr,
     pool: Weak<RwLock<EndpointConnPool>>,
 ) -> anyhow::Result<ClientInner> {
     let tls = config.tls_config.as_ref();
@@ -536,12 +529,8 @@ async fn connect_to_compute(
         ("application_name", APP_NAME),
         ("options", conn_info.options.as_deref().unwrap_or("")),
     ]);
-    let creds = auth::ClientCredentials::parse(
-        &params,
-        Some(&conn_info.hostname),
-        common_names,
-        peer_addr,
-    )?;
+    let creds =
+        auth::ClientCredentials::parse(ctx, &params, Some(&conn_info.hostname), common_names)?;
 
     let creds =
         ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
@@ -549,48 +538,48 @@ async fn connect_to_compute(
 
     let console_options = neon_options(&params);
 
-    let extra = console::ConsoleReqExtra {
-        session_id: uuid::Uuid::new_v4(),
-        application_name: APP_NAME.to_string(),
-        options: console_options,
-    };
     if !config.disable_ip_check_for_http {
-        let allowed_ips = backend.get_allowed_ips(&extra).await?;
-        if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
+        let allowed_ips = backend.get_allowed_ips(ctx).await?;
+        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
             return Err(auth::AuthError::ip_address_not_allowed().into());
         }
     }
+    let extra = console::ConsoleReqExtra {
+        options: console_options,
+    };
     let node_info = backend
-        .wake_compute(&extra)
+        .wake_compute(ctx, &extra)
         .await?
         .context("missing cache entry from wake_compute")?;
 
+    ctx.set_project(node_info.aux.clone());
+
     crate::proxy::connect_compute::connect_to_compute(
+        ctx,
         &TokioMechanism {
             conn_id,
             conn_info,
-            session_id,
             pool,
             idle: config.http_config.pool_options.idle_timeout,
         },
         node_info,
         &extra,
         &backend,
-        latency_timer,
     )
     .await
 }
 
 async fn connect_to_compute_once(
+    ctx: &mut RequestMonitoring,
     node_info: &console::CachedNodeInfo,
     conn_info: &ConnInfo,
     timeout: time::Duration,
     conn_id: uuid::Uuid,
-    mut session: uuid::Uuid,
     pool: Weak<RwLock<EndpointConnPool>>,
     idle: Duration,
 ) -> Result<ClientInner, tokio_postgres::Error> {
     let mut config = (*node_info.config).clone();
+    let mut session = ctx.session_id;
 
     let (client, mut connection) = config
         .user(&conn_info.username)
@@ -601,7 +590,7 @@ async fn connect_to_compute_once(
         .await?;
 
     let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
-        .with_label_values(&["http"])
+        .with_label_values(&[ctx.protocol])
         .guard();
 
     tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 2e9d8526d3..70c0343fa3 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,4 +1,3 @@
-use std::net::IpAddr;
 use std::sync::Arc;
 
 use anyhow::bail;
@@ -14,6 +13,7 @@ use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
+use smol_str::SmolStr;
 use tokio_postgres::error::DbError;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
@@ -29,6 +29,7 @@ use utils::http::error::ApiError;
 use utils::http::json::json_response;
 
 use crate::config::HttpConfig;
+use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 
 use super::conn_pool::ConnInfo;
@@ -121,6 +122,7 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
 }
 
 fn get_conn_info(
+    ctx: &mut RequestMonitoring,
     headers: &HeaderMap,
     sni_hostname: Option<String>,
 ) -> Result<ConnInfo, anyhow::Error> {
@@ -146,10 +148,11 @@ fn get_conn_info(
         .next()
         .ok_or(anyhow::anyhow!("invalid database name"))?;
 
-    let username = connection_url.username();
+    let username = SmolStr::from(connection_url.username());
     if username.is_empty() {
         return Err(anyhow::anyhow!("missing username"));
     }
+    ctx.set_user(username.clone());
 
     let password = connection_url
         .password()
@@ -176,6 +179,9 @@ fn get_conn_info(
         }
     }
 
+    let hostname: SmolStr = hostname.into();
+    ctx.set_endpoint_id(Some(hostname.clone()));
+
     let pairs = connection_url.query_pairs();
 
     let mut options = Option::None;
@@ -188,9 +194,9 @@ fn get_conn_info(
     }
 
     Ok(ConnInfo {
-        username: username.into(),
+        username,
         dbname: dbname.into(),
-        hostname: hostname.into(),
+        hostname,
         password: password.into(),
         options,
     })
@@ -198,23 +204,15 @@ fn get_conn_info(
 
 // TODO: return different http error codes
 pub async fn handle(
+    config: &'static HttpConfig,
+    ctx: &mut RequestMonitoring,
     request: Request<Body>,
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
-    session_id: uuid::Uuid,
-    peer_addr: IpAddr,
-    config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
     let result = tokio::time::timeout(
         config.request_timeout,
-        handle_inner(
-            config,
-            request,
-            sni_hostname,
-            conn_pool,
-            session_id,
-            peer_addr,
-        ),
+        handle_inner(config, ctx, request, sni_hostname, conn_pool),
     )
     .await;
     let mut response = match result {
@@ -297,11 +295,10 @@ pub async fn handle(
 #[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
 async fn handle_inner(
     config: &'static HttpConfig,
+    ctx: &mut RequestMonitoring,
     request: Request<Body>,
     sni_hostname: Option<String>,
     conn_pool: Arc<GlobalConnPool>,
-    session_id: uuid::Uuid,
-    peer_addr: IpAddr,
 ) -> anyhow::Result<Response<Body>> {
     let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
         .with_label_values(&["http"])
@@ -311,7 +308,7 @@ async fn handle_inner(
     // Determine the destination and connection params
     //
     let headers = request.headers();
-    let conn_info = get_conn_info(headers, sni_hostname)?;
+    let conn_info = get_conn_info(ctx, headers, sni_hostname)?;
 
     // Determine the output options. Default behaviour is 'false'. Anything that is not
     // strictly 'true' assumed to be false.
@@ -340,10 +337,12 @@ async fn handle_inner(
     let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
     let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
 
+    let paused = ctx.latency_timer.pause();
     let request_content_length = match request.body().size_hint().upper() {
         Some(v) => v,
         None => MAX_REQUEST_SIZE + 1,
     };
+    drop(paused);
 
     // we don't have a streaming request support yet so this is to prevent OOM
     // from a malicious user sending an extremely large request body
@@ -359,9 +358,7 @@ async fn handle_inner(
     let body = hyper::body::to_bytes(request.into_body()).await?;
     let payload: Payload = serde_json::from_slice(&body)?;
 
-    let mut client = conn_pool
-        .get(conn_info, !allow_pool, session_id, peer_addr)
-        .await?;
+    let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
 
     let mut response = Response::builder()
         .status(StatusCode::OK)
@@ -449,6 +446,7 @@ async fn handle_inner(
             }
         };
 
+    ctx.log();
     let metrics = client.metrics();
 
     // how could this possibly fail
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 071add3bca..a6529c920a 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -1,6 +1,7 @@
 use crate::{
     cancellation::CancelMap,
     config::ProxyConfig,
+    context::RequestMonitoring,
     error::io_error,
     proxy::{handle_client, ClientMode},
     rate_limiter::EndpointRateLimiter,
@@ -12,7 +13,6 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
 use pin_project_lite::pin_project;
 
 use std::{
-    net::IpAddr,
     pin::Pin,
     sync::Arc,
     task::{ready, Context, Poll},
@@ -130,22 +130,20 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 }
 
 pub async fn serve_websocket(
-    websocket: HyperWebsocket,
     config: &'static ProxyConfig,
+    ctx: &mut RequestMonitoring,
+    websocket: HyperWebsocket,
     cancel_map: &CancelMap,
-    session_id: uuid::Uuid,
     hostname: Option<String>,
-    peer_addr: IpAddr,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
     handle_client(
         config,
+        ctx,
         cancel_map,
-        session_id,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
-        peer_addr,
         endpoint_rate_limiter,
     )
     .await?;
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 4f13064088..57aa1ef0bc 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -52,6 +52,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128"] }
 once_cell = { version = "1" }
+parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -76,7 +77,7 @@ tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
 tungstenite = { version = "0.20" }
 url = { version = "2", features = ["serde"] }
-uuid = { version = "1", features = ["serde", "v4"] }
+uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
@@ -85,6 +86,7 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
+chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 itertools = { version = "0.10" }
@@ -92,7 +94,11 @@ libc = { version = "0.2", features = ["extra_traits"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
+num-bigint = { version = "0.4" }
+num-integer = { version = "0.1", features = ["i128"] }
+num-traits = { version = "0.2", features = ["i128"] }
 once_cell = { version = "1" }
+parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
@@ -101,5 +107,8 @@ serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+zstd = { version = "0.13" }
+zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
+zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
 
 ### END HAKARI SECTION